Diffstat (limited to 'contrib/llvm/lib/Transforms')
137 files changed, 19202 insertions, 12192 deletions
diff --git a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
index df08091..f9de54a 100644
--- a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -29,7 +29,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "argpromotion"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/Statistic.h"
@@ -37,18 +36,22 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/CallSite.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include <set>
 using namespace llvm;
 
+#define DEBUG_TYPE "argpromotion"
+
 STATISTIC(NumArgumentsPromoted , "Number of pointer arguments promoted");
 STATISTIC(NumAggregatesPromoted, "Number of aggregate arguments promoted");
 STATISTIC(NumByValArgsPromoted , "Number of byval arguments promoted");
@@ -58,29 +61,32 @@ namespace {
   /// ArgPromotion - The 'by reference' to 'by value' argument promotion pass.
   ///
   struct ArgPromotion : public CallGraphSCCPass {
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<AliasAnalysis>();
       CallGraphSCCPass::getAnalysisUsage(AU);
     }
 
-    virtual bool runOnSCC(CallGraphSCC &SCC);
+    bool runOnSCC(CallGraphSCC &SCC) override;
     static char ID; // Pass identification, replacement for typeid
     explicit ArgPromotion(unsigned maxElements = 3)
-        : CallGraphSCCPass(ID), maxElements(maxElements) {
+        : CallGraphSCCPass(ID), DL(nullptr), maxElements(maxElements) {
       initializeArgPromotionPass(*PassRegistry::getPassRegistry());
     }
 
     /// A vector used to hold the indices of a single GEP instruction
     typedef std::vector<uint64_t> IndicesVector;
 
+    const DataLayout *DL;
   private:
     CallGraphNode *PromoteArguments(CallGraphNode *CGN);
     bool isSafeToPromoteArgument(Argument *Arg, bool isByVal) const;
     CallGraphNode *DoPromotion(Function *F,
                                SmallPtrSet<Argument*, 8> &ArgsToPromote,
                                SmallPtrSet<Argument*, 8> &ByValArgsToTransform);
+    bool doInitialization(CallGraph &CG) override;
     /// The maximum number of elements to expand, or 0 for unlimited.
     unsigned maxElements;
+    DenseMap<const Function *, DISubprogram> FunctionDIs;
   };
 }
 
@@ -88,7 +94,7 @@ char ArgPromotion::ID = 0;
 INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion",
                 "Promote 'by reference' arguments to scalars", false, false)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
-INITIALIZE_PASS_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
 INITIALIZE_PASS_END(ArgPromotion, "argpromotion",
                 "Promote 'by reference' arguments to scalars", false, false)
 
@@ -99,6 +105,9 @@ Pass *llvm::createArgumentPromotionPass(unsigned maxElements) {
 bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) {
   bool Changed = false, LocalChange;
 
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  DL = DLP ? &DLP->getDataLayout() : nullptr;
+
   do {  // Iterate until we stop promoting from this SCC.
     LocalChange = false;
     // Attempt to promote arguments from all functions in this SCC.
@@ -123,24 +132,23 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
   Function *F = CGN->getFunction();
 
   // Make sure that it is local to this module.
-  if (!F || !F->hasLocalLinkage()) return 0;
+  if (!F || !F->hasLocalLinkage()) return nullptr;
 
   // First check: see if there are any pointer arguments!  If not, quick exit.
   SmallVector<Argument*, 16> PointerArgs;
   for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I)
     if (I->getType()->isPointerTy())
       PointerArgs.push_back(I);
-  if (PointerArgs.empty()) return 0;
+  if (PointerArgs.empty()) return nullptr;
 
   // Second check: make sure that all callers are direct callers.  We can't
   // transform functions that have indirect callers.  Also see if the function
   // is self-recursive.
   bool isSelfRecursive = false;
-  for (Value::use_iterator UI = F->use_begin(), E = F->use_end();
-       UI != E; ++UI) {
-    CallSite CS(*UI);
+  for (Use &U : F->uses()) {
+    CallSite CS(U.getUser());
     // Must be a direct call.
-    if (CS.getInstruction() == 0 || !CS.isCallee(UI)) return 0;
+    if (CS.getInstruction() == nullptr || !CS.isCallee(&U)) return nullptr;
 
     if (CS.getInstruction()->getParent()->getParent() == F)
       isSelfRecursive = true;
@@ -155,7 +163,8 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
     Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
 
     // If this is a byval argument, and if the aggregate type is small, just
-    // pass the elements, which is always safe.
+    // pass the elements, which is always safe.  This does not apply to
+    // inalloca.
     if (PtrArg->hasByValAttr()) {
       if (StructType *STy = dyn_cast<StructType>(AgTy)) {
         if (maxElements > 0 && STy->getNumElements() > maxElements) {
@@ -201,32 +210,32 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
     }
 
     // Otherwise, see if we can promote the pointer to its value.
-    if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValAttr()))
+    if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValOrInAllocaAttr()))
       ArgsToPromote.insert(PtrArg);
   }
 
   // No promotable pointer arguments.
   if (ArgsToPromote.empty() && ByValArgsToTransform.empty())
-    return 0;
+    return nullptr;
 
   return DoPromotion(F, ArgsToPromote, ByValArgsToTransform);
 }
 
 /// AllCallersPassInValidPointerForArgument - Return true if we can prove that
 /// all callees pass in a valid pointer for the specified function argument.
-static bool AllCallersPassInValidPointerForArgument(Argument *Arg) {
+static bool AllCallersPassInValidPointerForArgument(Argument *Arg,
+                                                    const DataLayout *DL) {
   Function *Callee = Arg->getParent();
 
   unsigned ArgNo = Arg->getArgNo();
 
   // Look at all call sites of the function.  At this pointer we know we only
   // have direct callees.
-  for (Value::use_iterator UI = Callee->use_begin(), E = Callee->use_end();
-       UI != E; ++UI) {
-    CallSite CS(*UI);
+  for (User *U : Callee->users()) {
+    CallSite CS(U);
     assert(CS && "Should only have direct calls!");
 
-    if (!CS.getArgument(ArgNo)->isDereferenceablePointer())
+    if (!CS.getArgument(ArgNo)->isDereferenceablePointer(DL))
       return false;
   }
   return true;
@@ -301,7 +310,8 @@ static void MarkIndicesSafe(const ArgPromotion::IndicesVector &ToMark,
 /// This method limits promotion of aggregates to only promote up to three
 /// elements of the aggregate in order to avoid exploding the number of
 /// arguments passed in.
-bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg, bool isByVal) const {
+bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
+                                           bool isByValOrInAlloca) const {
   typedef std::set<IndicesVector> GEPIndicesSet;
 
   // Quick exit for unused arguments
@@ -323,6 +333,9 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg, bool isByVal) const {
   //
   // This set will contain all sets of indices that are loaded in the entry
   // block, and thus are safe to unconditionally load in the caller.
+  //
+  // This optimization is also safe for InAlloca parameters, because it verifies
+  // that the address isn't captured.
   GEPIndicesSet SafeToUnconditionallyLoad;
 
   // This set contains all the sets of indices that we are planning to promote.
@@ -330,7 +343,7 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg, bool isByVal) const {
   GEPIndicesSet ToPromote;
 
   // If the pointer is always valid, any load with first index 0 is valid.
-  if (isByVal || AllCallersPassInValidPointerForArgument(Arg))
+  if (isByValOrInAlloca || AllCallersPassInValidPointerForArgument(Arg, DL))
     SafeToUnconditionallyLoad.insert(IndicesVector(1, 0));
 
   // First, iterate the entry block and mark loads of (geps of) arguments as
@@ -370,17 +383,16 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg, bool isByVal) const {
   // not (GEP+)loads, or any (GEP+)loads that are not safe to promote.
   SmallVector<LoadInst*, 16> Loads;
   IndicesVector Operands;
-  for (Value::use_iterator UI = Arg->use_begin(), E = Arg->use_end();
-       UI != E; ++UI) {
-    User *U = *UI;
+  for (Use &U : Arg->uses()) {
+    User *UR = U.getUser();
     Operands.clear();
-    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+    if (LoadInst *LI = dyn_cast<LoadInst>(UR)) {
       // Don't hack volatile/atomic loads
       if (!LI->isSimple()) return false;
       Loads.push_back(LI);
       // Direct loads are equivalent to a GEP with a zero index and then a load.
       Operands.push_back(0);
-    } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
+    } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UR)) {
       if (GEP->use_empty()) {
         // Dead GEP's cause trouble later.  Just remove them if we run into
        // them.
@@ -389,7 +401,7 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg, bool isByVal) const {
         // TODO: This runs the above loop over and over again for dead GEPs
         // Couldn't we just do increment the UI iterator earlier and erase the
         // use?
-        return isSafeToPromoteArgument(Arg, isByVal);
+        return isSafeToPromoteArgument(Arg, isByValOrInAlloca);
       }
 
       // Ensure that all of the indices are constants.
@@ -401,9 +413,8 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg, bool isByVal) const {
           return false;  // Not a constant operand GEP!
 
       // Ensure that the only users of the GEP are load instructions.
-      for (Value::use_iterator UI = GEP->use_begin(), E = GEP->use_end();
-           UI != E; ++UI)
-        if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
+      for (User *GEPU : GEP->users())
+        if (LoadInst *LI = dyn_cast<LoadInst>(GEPU)) {
          // Don't hack volatile/atomic loads
           if (!LI->isSimple()) return false;
           Loads.push_back(LI);
@@ -549,16 +560,15 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
     // In this table, we will track which indices are loaded from the argument
     // (where direct loads are tracked as no indices).
     ScalarizeTable &ArgIndices = ScalarizedElements[I];
-    for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E;
-         ++UI) {
-      Instruction *User = cast<Instruction>(*UI);
-      assert(isa<LoadInst>(User) || isa<GetElementPtrInst>(User));
+    for (User *U : I->users()) {
+      Instruction *UI = cast<Instruction>(U);
+      assert(isa<LoadInst>(UI) || isa<GetElementPtrInst>(UI));
       IndicesVector Indices;
-      Indices.reserve(User->getNumOperands() - 1);
+      Indices.reserve(UI->getNumOperands() - 1);
       // Since loads will only have a single operand, and GEPs only a single
       // non-index operand, this will record direct loads without any indices,
       // and gep+loads with the GEP indices.
-      for (User::op_iterator II = User->op_begin() + 1, IE = User->op_end();
+      for (User::op_iterator II = UI->op_begin() + 1, IE = UI->op_end();
            II != IE; ++II)
         Indices.push_back(cast<ConstantInt>(*II)->getSExtValue());
       // GEPs with a single 0 index can be merged with direct loads
@@ -566,11 +576,11 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
         Indices.clear();
       ArgIndices.insert(Indices);
       LoadInst *OrigLoad;
-      if (LoadInst *L = dyn_cast<LoadInst>(User))
+      if (LoadInst *L = dyn_cast<LoadInst>(UI))
         OrigLoad = L;
       else
         // Take any load, we will use it only to update Alias Analysis
-        OrigLoad = cast<LoadInst>(User->use_back());
+        OrigLoad = cast<LoadInst>(UI->user_back());
       OriginalLoads[std::make_pair(I, Indices)] = OrigLoad;
     }
 
@@ -603,6 +613,10 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
   Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName());
   NF->copyAttributesFrom(F);
 
+  // Patch the pointer to LLVM function in debug info descriptor.
+  auto DI = FunctionDIs.find(F);
+  if (DI != FunctionDIs.end())
+    DI->second.replaceFunction(NF);
 
   DEBUG(dbgs() << "ARG PROMOTION:  Promoting to:" << *NF << "\n"
                << "From: " << *F);
@@ -621,8 +635,8 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
 
   // Get the callgraph information that we need to update to reflect our
   // changes.
-  CallGraph &CG = getAnalysis<CallGraph>();
-
+  CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+
   // Get a new callgraph node for NF.
   CallGraphNode *NF_CGN = CG.getOrInsertFunction(NF);
 
@@ -631,7 +645,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
   //
   SmallVector<Value*, 16> Args;
   while (!F->use_empty()) {
-    CallSite CS(F->use_back());
+    CallSite CS(F->user_back());
     assert(CS.getCalledFunction() == F);
     Instruction *Call = CS.getInstruction();
     const AttributeSet &CallPAL = CS.getAttributes();
@@ -660,7 +674,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
           Type *AgTy = cast<PointerType>(I->getType())->getElementType();
           StructType *STy = cast<StructType>(AgTy);
           Value *Idxs[2] = {
-                ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), 0 };
+                ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr };
           for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
             Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
             Value *Idx = GetElementPtrInst::Create(*AI, Idxs,
@@ -740,6 +754,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
       if (cast<CallInst>(Call)->isTailCall())
         cast<CallInst>(New)->setTailCall();
     }
+    New->setDebugLoc(Call->getDebugLoc());
     Args.clear();
     AttributesVec.clear();
 
@@ -788,10 +803,10 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
       // Just add all the struct element types.
       Type *AgTy = cast<PointerType>(I->getType())->getElementType();
-      Value *TheAlloca = new AllocaInst(AgTy, 0, "", InsertPt);
+      Value *TheAlloca = new AllocaInst(AgTy, nullptr, "", InsertPt);
       StructType *STy = cast<StructType>(AgTy);
       Value *Idxs[2] = {
-            ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), 0 };
+            ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr };
 
       for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
         Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
@@ -807,6 +822,15 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
       I->replaceAllUsesWith(TheAlloca);
       TheAlloca->takeName(I);
       AA.replaceWithNewValue(I, TheAlloca);
+
+      // If the alloca is used in a call, we must clear the tail flag since
+      // the callee now uses an alloca from the caller.
+      for (User *U : TheAlloca->users()) {
+        CallInst *Call = dyn_cast<CallInst>(U);
+        if (!Call)
+          continue;
+        Call->setTailCall(false);
+      }
       continue;
     }
 
@@ -821,7 +845,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
     ScalarizeTable &ArgIndices = ScalarizedElements[I];
 
     while (!I->use_empty()) {
-      if (LoadInst *LI = dyn_cast<LoadInst>(I->use_back())) {
+      if (LoadInst *LI = dyn_cast<LoadInst>(I->user_back())) {
         assert(ArgIndices.begin()->empty() &&
                "Load element should sort to front!");
         I2->setName(I->getName()+".val");
@@ -831,7 +855,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
         DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName()
               << "' in function '" << F->getName() << "'\n");
       } else {
-        GetElementPtrInst *GEP = cast<GetElementPtrInst>(I->use_back());
+        GetElementPtrInst *GEP = cast<GetElementPtrInst>(I->user_back());
         IndicesVector Operands;
         Operands.reserve(GEP->getNumIndices());
         for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end();
@@ -861,7 +885,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
 
         // All of the uses must be load instructions.  Replace them all with
         // the argument specified by ArgNo.
         while (!GEP->use_empty()) {
-          LoadInst *L = cast<LoadInst>(GEP->use_back());
+          LoadInst *L = cast<LoadInst>(GEP->user_back());
           L->replaceAllUsesWith(TheArg);
           AA.replaceWithNewValue(L, TheArg);
           L->eraseFromParent();
@@ -892,3 +916,8 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
 
   return NF_CGN;
 }
+
+bool ArgPromotion::doInitialization(CallGraph &CG) {
+  FunctionDIs = makeSubprogramMap(CG.getModule());
+  return CallGraphSCCPass::doInitialization(CG);
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp b/contrib/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp
index 2e32240..6af1043 100644
--- a/contrib/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp
@@ -36,7 +36,7 @@ public:
     initializeBarrierNoopPass(*PassRegistry::getPassRegistry());
   }
 
-  bool runOnModule(Module &M) { return false; }
+  bool runOnModule(Module &M) override { return false; }
 };
 }
 
diff --git a/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp
index d94c0f4..23be081 100644
--- a/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp
@@ -17,7 +17,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "constmerge"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/PointerIntPair.h"
@@ -31,6 +30,8 @@
 #include "llvm/Pass.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "constmerge"
+
 STATISTIC(NumMerged, "Number of global constants merged");
 
 namespace {
@@ -42,7 +43,7 @@ namespace {
 
     // For this pass, process all of the globals in the module, eliminating
     // duplicate constants.
-    bool runOnModule(Module &M);
+    bool runOnModule(Module &M) override;
 
     // Return true iff we can determine the alignment of this global variable.
     bool hasKnownAlignment(GlobalVariable *GV) const;
@@ -51,7 +52,7 @@ namespace {
    // alignment to a concrete value.
     unsigned getAlignment(GlobalVariable *GV) const;
 
-    const DataLayout *TD;
+    const DataLayout *DL;
   };
 }
 
@@ -66,7 +67,7 @@ ModulePass *llvm::createConstantMergePass() { return new ConstantMerge(); }
 /// Find values that are marked as llvm.used.
 static void FindUsedValues(GlobalVariable *LLVMUsed,
                            SmallPtrSet<const GlobalValue*, 8> &UsedValues) {
-  if (LLVMUsed == 0) return;
+  if (!LLVMUsed) return;
 
   ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
   for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) {
@@ -77,8 +78,8 @@ static void FindUsedValues(GlobalVariable *LLVMUsed,
 }
 
 // True if A is better than B.
-static bool IsBetterCannonical(const GlobalVariable &A,
-                               const GlobalVariable &B) {
+static bool IsBetterCanonical(const GlobalVariable &A,
+                              const GlobalVariable &B) {
   if (!A.hasLocalLinkage() && B.hasLocalLinkage())
     return true;
 
@@ -89,20 +90,21 @@ static bool IsBetterCannonical(const GlobalVariable &A,
 }
 
 bool ConstantMerge::hasKnownAlignment(GlobalVariable *GV) const {
-  return TD || GV->getAlignment() != 0;
+  return DL || GV->getAlignment() != 0;
 }
 
 unsigned ConstantMerge::getAlignment(GlobalVariable *GV) const {
   unsigned Align = GV->getAlignment();
   if (Align)
     return Align;
-  if (TD)
-    return TD->getPreferredAlignment(GV);
+  if (DL)
+    return DL->getPreferredAlignment(GV);
   return 0;
 }
 
 bool ConstantMerge::runOnModule(Module &M) {
-  TD = getAnalysisIfAvailable<DataLayout>();
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  DL = DLP ? &DLP->getDataLayout() : nullptr;
 
   // Find all the globals that are marked "used".  These cannot be merged.
   SmallPtrSet<const GlobalValue*, 8> UsedGlobals;
@@ -160,7 +162,7 @@ bool ConstantMerge::runOnModule(Module &M) {
       // If this is the first constant we find or if the old one is local,
       // replace with the current one. If the current is externally visible
       // it cannot be replace, but can be the canonical constant we merge with.
-      if (Slot == 0 || IsBetterCannonical(*GV, *Slot))
+      if (!Slot || IsBetterCanonical(*GV, *Slot))
         Slot = GV;
     }
 
diff --git a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 911c14e..ac3853d 100644
--- a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -17,29 +17,31 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "deadargelim"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
-#include "llvm/DIBuilder.h"
-#include "llvm/DebugInfo.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constant.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
-#include "llvm/Support/CallSite.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include <map>
 #include <set>
+#include <tuple>
 using namespace llvm;
 
+#define DEBUG_TYPE "deadargelim"
+
 STATISTIC(NumArgumentsEliminated, "Number of unread args removed");
 STATISTIC(NumRetValsEliminated  , "Number of unused return values removed");
 STATISTIC(NumArgumentsReplacedWithUndef,
@@ -62,12 +64,7 @@ namespace {
 
     /// Make RetOrArg comparable, so we can put it into a map.
     bool operator<(const RetOrArg &O) const {
-      if (F != O.F)
-        return F < O.F;
-      else if (Idx != O.Idx)
-        return Idx < O.Idx;
-      else
-        return IsArg < O.IsArg;
+      return std::tie(F, Idx, IsArg) < std::tie(O.F, O.Idx, O.IsArg);
     }
 
     /// Make RetOrArg comparable, so we can easily iterate the multimap.
@@ -130,8 +127,7 @@ namespace {
     // As the code generation for module is finished (and DIBuilder is
     // finalized) we assume that subprogram descriptors won't be changed, and
     // they are stored in map for short duration anyway.
-    typedef DenseMap<Function*, DISubprogram> FunctionDIMap;
-    FunctionDIMap FunctionDIs;
+    DenseMap<const Function *, DISubprogram> FunctionDIs;
 
   protected:
     // DAH uses this to specify a different ID.
@@ -143,17 +139,16 @@ namespace {
       initializeDAEPass(*PassRegistry::getPassRegistry());
     }
 
-    bool runOnModule(Module &M);
+    bool runOnModule(Module &M) override;
 
     virtual bool ShouldHackArguments() const { return false; }
 
   private:
     Liveness MarkIfNotLive(RetOrArg Use, UseVector &MaybeLiveUses);
-    Liveness SurveyUse(Value::const_use_iterator U, UseVector &MaybeLiveUses,
+    Liveness SurveyUse(const Use *U, UseVector &MaybeLiveUses,
                        unsigned RetValNum = 0);
     Liveness SurveyUses(const Value *V, UseVector &MaybeLiveUses);
 
-    void CollectFunctionDIs(Module &M);
     void SurveyFunction(const Function &F);
     void MarkValue(const RetOrArg &RA, Liveness L,
                    const UseVector &MaybeLiveUses);
@@ -178,7 +173,7 @@ namespace {
     static char ID;
     DAH() : DAE(ID) {}
 
-    virtual bool ShouldHackArguments() const { return true; }
+    bool ShouldHackArguments() const override { return true; }
   };
 }
 
@@ -193,35 +188,6 @@ INITIALIZE_PASS(DAH, "deadarghaX0r",
 ModulePass *llvm::createDeadArgEliminationPass() { return new DAE(); }
 ModulePass *llvm::createDeadArgHackingPass() { return new DAH(); }
 
-/// CollectFunctionDIs - Map each function in the module to its debug info
-/// descriptor.
-void DAE::CollectFunctionDIs(Module &M) {
-  FunctionDIs.clear();
-
-  for (Module::named_metadata_iterator I = M.named_metadata_begin(),
-       E = M.named_metadata_end(); I != E; ++I) {
-    NamedMDNode &NMD = *I;
-    for (unsigned MDIndex = 0, MDNum = NMD.getNumOperands();
-         MDIndex < MDNum; ++MDIndex) {
-      MDNode *Node = NMD.getOperand(MDIndex);
-      if (!DIDescriptor(Node).isCompileUnit())
-        continue;
-      DICompileUnit CU(Node);
-      const DIArray &SPs = CU.getSubprograms();
-      for (unsigned SPIndex = 0, SPNum = SPs.getNumElements();
-           SPIndex < SPNum; ++SPIndex) {
-        DISubprogram SP(SPs.getElement(SPIndex));
-        assert((!SP || SP.isSubprogram()) &&
-          "A MDNode in subprograms of a CU should be null or a DISubprogram.");
-        if (!SP)
-          continue;
-        if (Function *F = SP.getFunction())
-          FunctionDIs[F] = SP;
-      }
-    }
-  }
-}
-
 /// DeleteDeadVarargs - If this is an function that takes a ... list, and if
 /// llvm.vastart is never called, the varargs list is dead for the function.
 bool DAE::DeleteDeadVarargs(Function &Fn) {
@@ -265,7 +231,7 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {
   // to pass in a smaller number of arguments into the new function.
   //
   std::vector<Value*> Args;
-  for (Value::use_iterator I = Fn.use_begin(), E = Fn.use_end(); I != E; ) {
+  for (Value::user_iterator I = Fn.user_begin(), E = Fn.user_end(); I != E; ) {
     CallSite CS(*I++);
     if (!CS)
       continue;
@@ -330,7 +296,7 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {
   }
 
   // Patch the pointer to LLVM function in debug info descriptor.
-  FunctionDIMap::iterator DI = FunctionDIs.find(&Fn);
+  auto DI = FunctionDIs.find(&Fn);
   if (DI != FunctionDIs.end())
     DI->second.replaceFunction(NF);
 
@@ -378,7 +344,7 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn)
        I != E; ++I) {
     Argument *Arg = I;
 
-    if (Arg->use_empty() && !Arg->hasByValAttr())
+    if (Arg->use_empty() && !Arg->hasByValOrInAllocaAttr())
       UnusedArgs.push_back(Arg->getArgNo());
   }
 
@@ -387,10 +353,9 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn)
 
   bool Changed = false;
 
-  for (Function::use_iterator I = Fn.use_begin(), E = Fn.use_end();
-       I != E; ++I) {
-    CallSite CS(*I);
-    if (!CS || !CS.isCallee(I))
+  for (Use &U : Fn.uses()) {
+    CallSite CS(U.getUser());
+    if (!CS || !CS.isCallee(&U))
       continue;
 
     // Now go through all unused args and replace them with "undef".
@@ -441,9 +406,9 @@ DAE::Liveness DAE::MarkIfNotLive(RetOrArg Use, UseVector &MaybeLiveUses) {
 /// RetValNum is the return value number to use when this use is used in a
 /// return instruction. This is used in the recursion, you should always leave
 /// it at 0.
-DAE::Liveness DAE::SurveyUse(Value::const_use_iterator U,
+DAE::Liveness DAE::SurveyUse(const Use *U,
                              UseVector &MaybeLiveUses, unsigned RetValNum) {
-  const User *V = *U;
+  const User *V = U->getUser();
   if (const ReturnInst *RI = dyn_cast<ReturnInst>(V)) {
     // The value is returned from a function. It's only live when the
     // function's return value is live. We use RetValNum here, for the case
@@ -454,7 +419,7 @@ DAE::Liveness DAE::SurveyUse(Value::const_use_iterator U,
       return MarkIfNotLive(Use, MaybeLiveUses);
     }
     if (const InsertValueInst *IV = dyn_cast<InsertValueInst>(V)) {
-      if (U.getOperandNo() != InsertValueInst::getAggregateOperandIndex()
+      if (U->getOperandNo() != InsertValueInst::getAggregateOperandIndex()
           && IV->hasIndices())
         // The use we are examining is inserted into an aggregate. Our liveness
         // depends on all uses of that aggregate, but if it is used as a return
@@ -465,9 +430,8 @@ DAE::Liveness DAE::SurveyUse(Value::const_use_iterator U,
       // we don't change RetValNum, but do survey all our uses.
 
       Liveness Result = MaybeLive;
-      for (Value::const_use_iterator I = IV->use_begin(),
-           E = V->use_end(); I != E; ++I) {
-        Result = SurveyUse(I, MaybeLiveUses, RetValNum);
+      for (const Use &UU : IV->uses()) {
+        Result = SurveyUse(&UU, MaybeLiveUses, RetValNum);
         if (Result == Live)
           break;
       }
@@ -490,7 +454,7 @@ DAE::Liveness DAE::SurveyUse(Value::const_use_iterator U,
         return Live;
 
       assert(CS.getArgument(ArgNo)
-             == CS->getOperand(U.getOperandNo())
+             == CS->getOperand(U->getOperandNo())
              && "Argument is not where we expected it");
 
       // Value passed to a normal call. It's only live when the corresponding
@@ -513,9 +477,8 @@ DAE::Liveness DAE::SurveyUses(const Value *V, UseVector &MaybeLiveUses) {
   // Assume it's dead (which will only hold if there are no uses at all..).
   Liveness Result = MaybeLive;
   // Check each use.
-  for (Value::const_use_iterator I = V->use_begin(),
-       E = V->use_end(); I != E; ++I) {
-    Result = SurveyUse(I, MaybeLiveUses);
+  for (const Use &U : V->uses()) {
+    Result = SurveyUse(&U, MaybeLiveUses);
     if (Result == Live)
       break;
   }
@@ -531,6 +494,13 @@ DAE::Liveness DAE::SurveyUses(const Value *V, UseVector &MaybeLiveUses) {
 // well as arguments to functions which have their "address taken".
 //
 void DAE::SurveyFunction(const Function &F) {
+  // Functions with inalloca parameters are expecting args in a particular
+  // register and memory layout.
+  if (F.getAttributes().hasAttrSomewhere(Attribute::InAlloca)) {
+    MarkLive(F);
+    return;
+  }
+
   unsigned RetCount = NumRetVals(&F);
   // Assume all return values are dead
   typedef SmallVector<Liveness, 5> RetVals;
@@ -562,12 +532,11 @@ void DAE::SurveyFunction(const Function &F) {
   unsigned NumLiveRetVals = 0;
   Type *STy = dyn_cast<StructType>(F.getReturnType());
   // Loop all uses of the function.
-  for (Value::const_use_iterator I = F.use_begin(), E = F.use_end();
-       I != E; ++I) {
+  for (const Use &U : F.uses()) {
     // If the function is PASSED IN as an argument, its address has been
     // taken.
-    ImmutableCallSite CS(*I);
-    if (!CS || !CS.isCallee(I)) {
+    ImmutableCallSite CS(U.getUser());
+    if (!CS || !CS.isCallee(&U)) {
       MarkLive(F);
       return;
     }
@@ -586,9 +555,8 @@ void DAE::SurveyFunction(const Function &F) {
     if (NumLiveRetVals != RetCount) {
       if (STy) {
         // Check all uses of the return value.
-        for (Value::const_use_iterator I = TheCall->use_begin(),
-             E = TheCall->use_end(); I != E; ++I) {
-          const ExtractValueInst *Ext = dyn_cast<ExtractValueInst>(*I);
+        for (const User *U : TheCall->users()) {
+          const ExtractValueInst *Ext = dyn_cast<ExtractValueInst>(U);
           if (Ext && Ext->hasIndices()) {
             // This use uses a part of our return value, survey the uses of
             // that part and store the results for this index only.
@@ -767,7 +735,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
 
   // Find out the new return value.
   Type *RetTy = FTy->getReturnType();
-  Type *NRetTy = NULL;
+  Type *NRetTy = nullptr;
   unsigned RetCount = NumRetVals(F);
 
   // -1 means unused, other numbers are the new index
@@ -891,7 +859,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
   //
   std::vector<Value*> Args;
   while (!F->use_empty()) {
-    CallSite CS(F->use_back());
+    CallSite CS(F->user_back());
     Instruction *Call = CS.getInstruction();
 
     AttributesVec.clear();
@@ -1053,7 +1021,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
 
     Value *RetVal;
     if (NFTy->getReturnType()->isVoidTy()) {
-      RetVal = 0;
+      RetVal = nullptr;
     } else {
       assert (RetTy->isStructTy());
       // The original return value was a struct, insert
@@ -1088,7 +1056,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
   }
 
   // Patch the pointer to LLVM function in debug info descriptor.
-  FunctionDIMap::iterator DI = FunctionDIs.find(F);
+  auto DI = FunctionDIs.find(F);
   if (DI != FunctionDIs.end())
     DI->second.replaceFunction(NF);
 
@@ -1102,7 +1070,7 @@ bool DAE::runOnModule(Module &M) {
   bool Changed = false;
 
   // Collect debug info descriptors for functions.
-  CollectFunctionDIs(M);
+  FunctionDIs = makeSubprogramMap(M);
 
   // First pass: Do a simple check to see if any functions can have their "..."
   // removed.  We can do this if they never call va_start.  This loop cannot be
diff --git a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp
index 50fb3e6..40ec9fa 100644
--- a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp
@@ -27,11 +27,10 @@ using namespace llvm;
 /// the split module remain valid.
 static void makeVisible(GlobalValue &GV, bool Delete) {
   bool Local = GV.hasLocalLinkage();
-  if (Local)
-    GV.setVisibility(GlobalValue::HiddenVisibility);
-
   if (Local || Delete) {
     GV.setLinkage(GlobalValue::ExternalLinkage);
+    if (Local)
+      GV.setVisibility(GlobalValue::HiddenVisibility);
     return;
   }
 
@@ -68,7 +67,7 @@ namespace {
     explicit GVExtractorPass(std::vector<GlobalValue*>& GVs, bool deleteS = true)
       : ModulePass(ID), Named(GVs.begin(), GVs.end()), deleteStuff(deleteS) {}
 
-    bool runOnModule(Module &M) {
+    bool runOnModule(Module &M) override {
       // Visit the global inline asm.
       if (!deleteStuff)
         M.setModuleInlineAsm("");
@@ -95,7 +94,7 @@ namespace {
         makeVisible(*I, Delete);
 
         if (Delete)
-          I->setInitializer(0);
+          I->setInitializer(nullptr);
       }
 
       // Visit the Functions.
@@ -134,7 +133,7 @@ namespace {
         } else {
           Declaration =
             new GlobalVariable(M, Ty, false, GlobalValue::ExternalLinkage,
-                               0, CurI->getName());
+                               nullptr, CurI->getName());
         }
 
         CurI->replaceAllUsesWith(Declaration);
diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index 60e5f06..8174df9 100644
--- a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -18,7 +18,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "functionattrs"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/ADT/SCCIterator.h"
 #include "llvm/ADT/SetVector.h"
@@ -29,12 +28,14 @@
 #include "llvm/Analysis/CallGraphSCCPass.h"
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
-#include "llvm/Support/InstIterator.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "functionattrs"
+
 STATISTIC(NumReadNone, "Number of functions marked readnone");
 STATISTIC(NumReadOnly, "Number of functions marked readonly");
 STATISTIC(NumNoCapture, "Number of arguments marked nocapture");
@@ -46,12 +47,12 @@ STATISTIC(NumAnnotated, "Number of attributes added to library functions");
 namespace {
   struct FunctionAttrs : public CallGraphSCCPass {
     static char ID; // Pass identification, replacement for typeid
-    FunctionAttrs() : CallGraphSCCPass(ID), AA(0) {
+    FunctionAttrs() : CallGraphSCCPass(ID), AA(nullptr) {
      initializeFunctionAttrsPass(*PassRegistry::getPassRegistry());
     }
 
     // runOnSCC - Analyze the SCC, performing the transformation if possible.
-    bool runOnSCC(CallGraphSCC &SCC);
+    bool runOnSCC(CallGraphSCC &SCC) override;
 
     // AddReadAttrs - Deduce readonly/readnone attributes for the SCC.
     bool AddReadAttrs(const CallGraphSCC &SCC);
@@ -120,7 +121,7 @@ namespace {
     // call declarations.
     bool annotateLibraryCalls(const CallGraphSCC &SCC);
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
       AU.addRequired<AliasAnalysis>();
       AU.addRequired<TargetLibraryInfo>();
@@ -137,7 +138,7 @@ char FunctionAttrs::ID = 0;
 INITIALIZE_PASS_BEGIN(FunctionAttrs, "functionattrs",
                 "Deduce function attributes", false, false)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
-INITIALIZE_PASS_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
 INITIALIZE_PASS_END(FunctionAttrs, "functionattrs",
                 "Deduce function attributes", false, false)
@@ -160,7 +161,7 @@ bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) {
   for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
     Function *F = (*I)->getFunction();
 
-    if (F == 0)
+    if (!F)
       // External node - may write memory.  Just give up.
       return false;
 
@@ -319,7 +320,7 @@ namespace {
     ArgumentGraphNode SyntheticRoot;
 
   public:
-    ArgumentGraph() { SyntheticRoot.Definition = 0; }
+    ArgumentGraph() { SyntheticRoot.Definition = nullptr; }
 
     typedef SmallVectorImpl<ArgumentGraphNode*>::iterator iterator;
 
@@ -342,9 +343,9 @@ namespace {
     ArgumentUsesTracker(const SmallPtrSet<Function*, 8> &SCCNodes)
      : Captured(false), SCCNodes(SCCNodes) {}
 
-    void tooManyUses() { Captured = true; }
+    void tooManyUses() override { Captured = true; }
 
-    bool captured(Use *U) {
+    bool captured(const Use *U) override {
      CallSite CS(U->getUser());
       if (!CS.getInstruction()) { Captured = true; return true; }
 
@@ -414,17 +415,19 @@ determinePointerReadAttrs(Argument *A,
   SmallSet<Use*, 32> Visited;
   int Count = 0;
 
+  // inalloca arguments are always clobbered by the call.
+  if (A->hasInAllocaAttr())
+    return Attribute::None;
+
   bool IsRead = false;
   // We don't need to track IsWritten. If A is written to, return immediately.
 
-  for (Value::use_iterator UI = A->use_begin(), UE = A->use_end();
-       UI != UE; ++UI) {
+  for (Use &U : A->uses()) {
     if (Count++ >= 20)
       return Attribute::None;
 
-    Use *U = &UI.getUse();
-    Visited.insert(U);
-    Worklist.push_back(U);
+    Visited.insert(&U);
+    Worklist.push_back(&U);
   }
 
   while (!Worklist.empty()) {
@@ -437,25 +440,38 @@ determinePointerReadAttrs(Argument *A,
     case Instruction::GetElementPtr:
     case Instruction::PHI:
     case Instruction::Select:
+    case Instruction::AddrSpaceCast:
       // The original value is not read/written via this if the new value isn't.
-      for (Instruction::use_iterator UI = I->use_begin(), UE = I->use_end();
-           UI != UE; ++UI) {
-        Use *U = &UI.getUse();
-        if (Visited.insert(U))
-          Worklist.push_back(U);
-      }
+      for (Use &UU : I->uses())
+        if (Visited.insert(&UU))
+          Worklist.push_back(&UU);
       break;
 
     case Instruction::Call:
     case Instruction::Invoke: {
+      bool Captures = true;
+
+      if (I->getType()->isVoidTy())
+        Captures = false;
+
+      auto AddUsersToWorklistIfCapturing = [&] {
+        if (Captures)
+          for (Use &UU : I->uses())
+            if (Visited.insert(&UU))
+              Worklist.push_back(&UU);
+      };
+
       CallSite CS(I);
-      if (CS.doesNotAccessMemory())
+      if (CS.doesNotAccessMemory()) {
+        AddUsersToWorklistIfCapturing();
         continue;
+      }
 
       Function *F = CS.getCalledFunction();
       if (!F) {
         if (CS.onlyReadsMemory()) {
           IsRead = true;
+          AddUsersToWorklistIfCapturing();
           continue;
         }
         return Attribute::None;
@@ -470,6 +486,7 @@ determinePointerReadAttrs(Argument *A,
                  "More params than args in non-varargs call.");
          return Attribute::None;
         }
+        Captures &= !CS.doesNotCapture(A - B);
         if (SCCNodes.count(AI))
           continue;
         if (!CS.onlyReadsMemory() && !CS.onlyReadsMemory(A - B))
@@ -478,6 +495,7 @@ determinePointerReadAttrs(Argument *A,
           IsRead = true;
         }
       }
+      AddUsersToWorklistIfCapturing();
       break;
     }
 
@@ -521,7 +539,7 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) {
   for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
     Function *F = (*I)->getFunction();
 
-    if (F == 0)
+    if (!F)
       // External node - only a problem for arguments that we pass to it.
      continue;
 
@@ -599,9 +617,8 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) {
   // made.  If the definition doesn't have a 'nocapture' attribute by now, it
   // captures.
-  for (scc_iterator<ArgumentGraph*> I = scc_begin(&AG), E = scc_end(&AG);
-       I != E; ++I) {
-    std::vector<ArgumentGraphNode*> &ArgumentSCC = *I;
+  for (scc_iterator<ArgumentGraph*> I = scc_begin(&AG); !I.isAtEnd(); ++I) {
+    const std::vector<ArgumentGraphNode *> &ArgumentSCC = *I;
     if (ArgumentSCC.size() == 1) {
       if (!ArgumentSCC[0]->Definition) continue;  // synthetic root node
@@ -617,8 +634,8 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) {
     }
 
     bool SCCCaptured = false;
-    for (std::vector<ArgumentGraphNode*>::iterator I = ArgumentSCC.begin(),
-           E = ArgumentSCC.end(); I != E && !SCCCaptured; ++I) {
+    for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end();
+         I != E && !SCCCaptured; ++I) {
       ArgumentGraphNode *Node = *I;
       if (Node->Uses.empty()) {
         if (!Node->Definition->hasNoCaptureAttr())
@@ -630,13 +647,12 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) {
     SmallPtrSet<Argument*, 8> ArgumentSCCNodes;
     // Fill ArgumentSCCNodes with the elements of the ArgumentSCC.  Used for
     // quickly looking up whether a given Argument is in this ArgumentSCC.
-    for (std::vector<ArgumentGraphNode*>::iterator I = ArgumentSCC.begin(),
-           E = ArgumentSCC.end(); I != E; ++I) {
+    for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end(); I != E; ++I) {
       ArgumentSCCNodes.insert((*I)->Definition);
     }
 
-    for (std::vector<ArgumentGraphNode*>::iterator I = ArgumentSCC.begin(),
-           E = ArgumentSCC.end(); I != E && !SCCCaptured; ++I) {
+    for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end();
+         I != E && !SCCCaptured; ++I) {
       ArgumentGraphNode *N = *I;
       for (SmallVectorImpl<ArgumentGraphNode*>::iterator UI = N->Uses.begin(),
              UE = N->Uses.end(); UI != UE; ++UI) {
@@ -723,6 +739,7 @@ bool FunctionAttrs::IsFunctionMallocLike(Function *F,
       // Extend the analysis by looking upwards.
       case Instruction::BitCast:
       case Instruction::GetElementPtr:
+      case Instruction::AddrSpaceCast:
         FlowsToReturn.insert(RVI->getOperand(0));
         continue;
       case Instruction::Select: {
@@ -775,7 +792,7 @@ bool FunctionAttrs::AddNoAliasAttrs(const CallGraphSCC &SCC) {
   for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
     Function *F = (*I)->getFunction();
 
-    if (F == 0)
+    if (!F)
       // External node - skip it;
       return false;
 
@@ -1649,6 +1666,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
+    break;
   default:
     // Didn't mark any attributes.
     return false;
   }
 
@@ -1667,7 +1685,7 @@ bool FunctionAttrs::annotateLibraryCalls(const CallGraphSCC &SCC) {
   for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
     Function *F = (*I)->getFunction();
 
-    if (F != 0 && F->isDeclaration())
+    if (F && F->isDeclaration())
       MadeChange |= inferPrototypeAttributes(*F);
   }
 
diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp
index 901295d..7e7a4c0 100644
--- a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp
@@ -15,15 +15,18 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "globaldce"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/CtorUtils.h"
 #include "llvm/Pass.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "globaldce"
+
 STATISTIC(NumAliases  , "Number of global aliases removed");
 STATISTIC(NumFunctions, "Number of functions removed");
 STATISTIC(NumVariables, "Number of global variables removed");
@@ -38,7 +41,7 @@ namespace {
     // run - Do the GlobalDCE pass on the specified module, optionally updating
     // the specified callgraph to reflect the changes.
     //
-    bool runOnModule(Module &M);
+    bool runOnModule(Module &M) override;
 
   private:
     SmallPtrSet<GlobalValue*, 32> AliveGlobals;
@@ -53,6 +56,15 @@ namespace {
   };
 }
 
+/// Returns true if F contains only a single "ret" instruction.
+static bool isEmptyFunction(Function *F) {
+  BasicBlock &Entry = F->getEntryBlock();
+  if (Entry.size() != 1 || !isa<ReturnInst>(Entry.front()))
+    return false;
+  ReturnInst &RI = cast<ReturnInst>(Entry.front());
+  return RI.getReturnValue() == nullptr;
+}
+
 char GlobalDCE::ID = 0;
 INITIALIZE_PASS(GlobalDCE, "globaldce",
                 "Dead Global Elimination", false, false)
@@ -61,14 +73,23 @@ ModulePass *llvm::createGlobalDCEPass() { return new GlobalDCE(); }
 
 bool GlobalDCE::runOnModule(Module &M) {
   bool Changed = false;
-
+
+  // Remove empty functions from the global ctors list.
+  Changed |= optimizeGlobalCtorsList(M, isEmptyFunction);
+
+  typedef std::multimap<const Comdat *, GlobalValue *> ComdatGVPairsTy;
+  ComdatGVPairsTy ComdatGVPairs;
+
   // Loop over the module, adding globals which are obviously necessary.
   for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
     Changed |= RemoveUnusedGlobalValue(*I);
     // Functions with external linkage are needed if they have a body
-    if (!I->isDiscardableIfUnused() &&
-        !I->isDeclaration() && !I->hasAvailableExternallyLinkage())
-      GlobalIsNeeded(I);
+    if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) {
+      if (!I->isDiscardableIfUnused())
+        GlobalIsNeeded(I);
+      else if (const Comdat *C = I->getComdat())
+        ComdatGVPairs.insert(std::make_pair(C, I));
+    }
   }
 
   for (Module::global_iterator I = M.global_begin(), E = M.global_end();
@@ -76,17 +97,38 @@ bool GlobalDCE::runOnModule(Module &M) {
        I != E; ++I) {
     Changed |= RemoveUnusedGlobalValue(*I);
     // Externally visible & appending globals are needed, if they have an
     // initializer.
-    if (!I->isDiscardableIfUnused() &&
-        !I->isDeclaration() && !I->hasAvailableExternallyLinkage())
-      GlobalIsNeeded(I);
+    if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) {
+      if (!I->isDiscardableIfUnused())
+        GlobalIsNeeded(I);
+      else if (const Comdat *C = I->getComdat())
+        ComdatGVPairs.insert(std::make_pair(C, I));
+    }
   }
 
   for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
        I != E; ++I) {
     Changed |= RemoveUnusedGlobalValue(*I);
     // Externally visible aliases are needed.
-    if (!I->isDiscardableIfUnused())
+    if (!I->isDiscardableIfUnused()) {
       GlobalIsNeeded(I);
+    } else if (const Comdat *C = I->getComdat()) {
+      ComdatGVPairs.insert(std::make_pair(C, I));
+    }
+  }
+
+  for (ComdatGVPairsTy::iterator I = ComdatGVPairs.begin(),
+                                 E = ComdatGVPairs.end();
+       I != E;) {
+    ComdatGVPairsTy::iterator UB = ComdatGVPairs.upper_bound(I->first);
+    bool CanDiscard = std::all_of(I, UB, [](ComdatGVPairsTy::value_type Pair) {
+      return Pair.second->isDiscardableIfUnused();
+    });
+    if (!CanDiscard) {
+      std::for_each(I, UB, [this](ComdatGVPairsTy::value_type Pair) {
+        GlobalIsNeeded(Pair.second);
+      });
+    }
+    I = UB;
   }
 
   // Now that all globals which are needed are in the AliveGlobals set, we loop
@@ -99,7 +141,7 @@ bool GlobalDCE::runOnModule(Module &M) {
        I != E; ++I)
     if (!AliveGlobals.count(I)) {
       DeadGlobalVars.push_back(I);         // Keep track of dead globals
-      I->setInitializer(0);
+      I->setInitializer(nullptr);
     }
 
   // The second pass drops the bodies of functions which are dead...
@@ -117,7 +159,7 @@ bool GlobalDCE::runOnModule(Module &M) {
       ++I)
     if (!AliveGlobals.count(I)) {
      DeadAliases.push_back(I);
-      I->setAliasee(0);
+      I->setAliasee(nullptr);
     }
 
   if (!DeadFunctions.empty()) {
diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 2ea89a1..c1d0d3bc 100644
--- a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -13,37 +13,41 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "globalopt"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/ValueHandle.h"
 #include "llvm/Pass.h"
-#include "llvm/Support/CallSite.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/GetElementPtrTypeIterator.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/ValueHandle.h"
 #include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/CtorUtils.h"
 #include "llvm/Transforms/Utils/GlobalStatus.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
 #include <algorithm>
+#include <deque>
 using namespace llvm;
 
+#define DEBUG_TYPE "globalopt"
+
 STATISTIC(NumMarked    , "Number of globals marked constant");
 STATISTIC(NumUnnamed   , "Number of globals marked unnamed_addr");
 STATISTIC(NumSRA       , "Number of aggregate globals broken into scalars");
@@ -63,7 +67,7 @@ STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed");
 
 namespace {
   struct GlobalOpt : public ModulePass {
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<TargetLibraryInfo>();
     }
     static char ID; // Pass identification, replacement for typeid
@@ -71,20 +75,18 @@ namespace {
       initializeGlobalOptPass(*PassRegistry::getPassRegistry());
     }
 
-    bool runOnModule(Module &M);
+    bool runOnModule(Module &M) override;
 
   private:
-    GlobalVariable *FindGlobalCtors(Module &M);
     bool OptimizeFunctions(Module &M);
     bool OptimizeGlobalVars(Module &M);
     bool OptimizeGlobalAliases(Module &M);
-    bool OptimizeGlobalCtorsList(GlobalVariable *&GCL);
     bool ProcessGlobal(GlobalVariable *GV,Module::global_iterator &GVI);
     bool ProcessInternalGlobal(GlobalVariable *GV,Module::global_iterator &GVI,
                                const GlobalStatus &GS);
     bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn);
 
-    DataLayout *TD;
+    const DataLayout *DL;
     TargetLibraryInfo *TLI;
   };
 }
@@ -196,7 +198,7 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV,
   SmallVector<std::pair<Instruction *, Instruction *>, 32> Dead;
 
   // Constants can't be pointers to dynamically allocated memory.
-  for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end();
+  for (Value::user_iterator UI = GV->user_begin(), E = GV->user_end();
        UI != E;) {
     User *U = *UI++;
     if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
@@ -266,13 +268,14 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV,
 /// quick scan over the use list to clean up the easy and obvious cruft.  This
 /// returns true if it made a change.
 static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
-                                       DataLayout *TD, TargetLibraryInfo *TLI) {
+                                       const DataLayout *DL,
+                                       TargetLibraryInfo *TLI) {
   bool Changed = false;
   // Note that we need to use a weak value handle for the worklist items. When
   // we delete a constant array, we may also be holding pointer to one of its
   // elements (or an element of one of its elements if we're dealing with an
   // array of arrays) in the worklist.
-  SmallVector<WeakVH, 8> WorkList(V->use_begin(), V->use_end());
+  SmallVector<WeakVH, 8> WorkList(V->user_begin(), V->user_end());
   while (!WorkList.empty()) {
     Value *UV = WorkList.pop_back_val();
     if (!UV)
@@ -293,14 +296,15 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
       Changed = true;
     } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
       if (CE->getOpcode() == Instruction::GetElementPtr) {
-        Constant *SubInit = 0;
+        Constant *SubInit = nullptr;
         if (Init)
           SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
-        Changed |= CleanupConstantGlobalUsers(CE, SubInit, TD, TLI);
-      } else if (CE->getOpcode() == Instruction::BitCast &&
-                 CE->getType()->isPointerTy()) {
+        Changed |= CleanupConstantGlobalUsers(CE, SubInit, DL, TLI);
+      } else if ((CE->getOpcode() == Instruction::BitCast &&
+                  CE->getType()->isPointerTy()) ||
+                 CE->getOpcode() == Instruction::AddrSpaceCast) {
         // Pointer cast, delete any stores and memsets to the global.
-        Changed |= CleanupConstantGlobalUsers(CE, 0, TD, TLI);
+        Changed |= CleanupConstantGlobalUsers(CE, nullptr, DL, TLI);
       }
 
       if (CE->use_empty()) {
@@ -311,10 +315,10 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
       // Do not transform "gepinst (gep constexpr (GV))" here, because forming
       // "gepconstexpr (gep constexpr (GV))" will cause the two gep's to fold
       // and will invalidate our notion of what Init is.
-      Constant *SubInit = 0;
+      Constant *SubInit = nullptr;
       if (!isa<ConstantExpr>(GEP->getOperand(0))) {
         ConstantExpr *CE =
-          dyn_cast_or_null<ConstantExpr>(ConstantFoldInstruction(GEP, TD, TLI));
+          dyn_cast_or_null<ConstantExpr>(ConstantFoldInstruction(GEP, DL, TLI));
         if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr)
           SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
 
@@ -324,7 +328,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
         if (Init && isa<ConstantAggregateZero>(Init) && GEP->isInBounds())
           SubInit = Constant::getNullValue(GEP->getType()->getElementType());
       }
-      Changed |= CleanupConstantGlobalUsers(GEP, SubInit, TD, TLI);
+      Changed |= CleanupConstantGlobalUsers(GEP, SubInit, DL, TLI);
 
       if (GEP->use_empty()) {
         GEP->eraseFromParent();
@@ -341,7 +345,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
     // us, and if they are all dead, nuke them without remorse.
     if (isSafeToDestroyConstant(C)) {
       C->destroyConstant();
-      CleanupConstantGlobalUsers(V, Init, TD, TLI);
+      CleanupConstantGlobalUsers(V, Init, DL, TLI);
       return true;
     }
   }
@@ -368,15 +372,14 @@ static bool isSafeSROAElementUse(Value *V) {
 
   // Otherwise, it must be a GEP.
   GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I);
-  if (GEPI == 0) return false;
+  if (!GEPI) return false;
 
   if (GEPI->getNumOperands() < 3 || !isa<Constant>(GEPI->getOperand(1)) ||
       !cast<Constant>(GEPI->getOperand(1))->isNullValue())
     return false;
 
-  for (Value::use_iterator I = GEPI->use_begin(), E = GEPI->use_end();
-       I != E; ++I)
-    if (!isSafeSROAElementUse(*I))
+  for (User *U : GEPI->users())
+    if (!isSafeSROAElementUse(U))
      return false;
   return true;
 }
@@ -442,9 +445,10 @@ static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) {
     }
   }
 
-  for (Value::use_iterator I = U->use_begin(), E = U->use_end(); I != E; ++I)
-    if (!isSafeSROAElementUse(*I))
+  for (User *UU : U->users())
+    if (!isSafeSROAElementUse(UU))
       return false;
+
   return true;
 }
 
@@ -452,11 +456,10 @@ static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) {
 /// is safe for us to perform this transformation.
 ///
 static bool GlobalUsersSafeToSRA(GlobalValue *GV) {
-  for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end();
-       UI != E; ++UI) {
-    if (!IsUserOfGlobalSafeForSRA(*UI, GV))
+  for (User *U : GV->users())
+    if (!IsUserOfGlobalSafeForSRA(U, GV))
       return false;
-  }
+
   return true;
 }
 
@@ -466,10 +469,10 @@ static bool GlobalUsersSafeToSRA(GlobalValue *GV) {
 /// behavior of the program in a more fine-grained way.  We have determined that
 /// this transformation is safe already.  We return the first global variable we
 /// insert so that the caller can reprocess it.
-static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &TD) {
+static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
   // Make sure this global only has simple uses that we can SRA.
   if (!GlobalUsersSafeToSRA(GV))
-    return 0;
+    return nullptr;
 
   assert(GV->hasLocalLinkage() && !GV->isConstant());
   Constant *Init = GV->getInitializer();
@@ -481,11 +484,11 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &TD) {
   // Get the alignment of the global, either explicit or target-specific.
   unsigned StartAlignment = GV->getAlignment();
   if (StartAlignment == 0)
-    StartAlignment = TD.getABITypeAlignment(GV->getType());
+    StartAlignment = DL.getABITypeAlignment(GV->getType());
 
   if (StructType *STy = dyn_cast<StructType>(Ty)) {
     NewGlobals.reserve(STy->getNumElements());
-    const StructLayout &Layout = *TD.getStructLayout(STy);
+    const StructLayout &Layout = *DL.getStructLayout(STy);
     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
       Constant *In = Init->getAggregateElement(i);
       assert(In && "Couldn't get element of initializer?");
@@ -502,7 +505,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &TD) {
       // propagate info to each field.
       uint64_t FieldOffset = Layout.getElementOffset(i);
       unsigned NewAlign = (unsigned)MinAlign(StartAlignment, FieldOffset);
-      if (NewAlign > TD.getABITypeAlignment(STy->getElementType(i)))
+      if (NewAlign > DL.getABITypeAlignment(STy->getElementType(i)))
         NGV->setAlignment(NewAlign);
     }
   } else if (SequentialType *STy = dyn_cast<SequentialType>(Ty)) {
@@ -513,11 +516,11 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &TD) {
       NumElements = cast<VectorType>(STy)->getNumElements();
 
     if (NumElements > 16 && GV->hasNUsesOrMore(16))
-      return 0; // It's not worth it.
+      return nullptr; // It's not worth it.
     NewGlobals.reserve(NumElements);
 
-    uint64_t EltSize = TD.getTypeAllocSize(STy->getElementType());
-    unsigned EltAlign = TD.getABITypeAlignment(STy->getElementType());
+    uint64_t EltSize = DL.getTypeAllocSize(STy->getElementType());
+    unsigned EltAlign = DL.getABITypeAlignment(STy->getElementType());
     for (unsigned i = 0, e = NumElements; i != e; ++i) {
       Constant *In = Init->getAggregateElement(i);
       assert(In && "Couldn't get element of initializer?");
@@ -540,7 +543,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &TD) {
   }
 
   if (NewGlobals.empty())
-    return 0;
+    return nullptr;
 
   DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV);
 
@@ -549,7 +552,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &TD) {
   // Loop over all of the uses of the global, replacing the constantexpr geps,
   // with smaller constantexpr geps or direct references.
   while (!GV->use_empty()) {
-    User *GEP = GV->use_back();
+    User *GEP = GV->user_back();
     assert(((isa<ConstantExpr>(GEP) &&
             cast<ConstantExpr>(GEP)->getOpcode()==Instruction::GetElementPtr)||
             isa<GetElementPtrInst>(GEP)) && "NonGEP CE's are not SRAable!");
@@ -602,7 +605,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &TD) {
       if (FirstGlobal == i) ++FirstGlobal;
     }
 
-  return FirstGlobal != NewGlobals.size() ? NewGlobals[FirstGlobal] : 0;
+  return FirstGlobal != NewGlobals.size() ? NewGlobals[FirstGlobal] : nullptr;
 }
 
 /// AllUsesOfValueWillTrapIfNull - Return true if all users of the specified
@@ -610,10 +613,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &TD) {
 /// phi nodes we've seen to avoid reprocessing them.
 static bool AllUsesOfValueWillTrapIfNull(const Value *V,
                                          SmallPtrSet<const PHINode*, 8> &PHIs) {
-  for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;
-       ++UI) {
-    const User *U = *UI;
-
+  for (const User *U : V->users())
     if (isa<LoadInst>(U)) {
       // Will trap.
} else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) { @@ -641,13 +641,13 @@ static bool AllUsesOfValueWillTrapIfNull(const Value *V, if (PHIs.insert(PN) && !AllUsesOfValueWillTrapIfNull(PN, PHIs)) return false; } else if (isa<ICmpInst>(U) && - isa<ConstantPointerNull>(UI->getOperand(1))) { + isa<ConstantPointerNull>(U->getOperand(1))) { // Ignore icmp X, null } else { //cerr << "NONTRAPPING USE: " << *U; return false; } - } + return true; } @@ -655,10 +655,7 @@ static bool AllUsesOfValueWillTrapIfNull(const Value *V, /// from GV will trap if the loaded value is null. Note that this also permits /// comparisons of the loaded value against null, as a special case. static bool AllUsesOfLoadedValueWillTrapIfNull(const GlobalVariable *GV) { - for (Value::const_use_iterator UI = GV->use_begin(), E = GV->use_end(); - UI != E; ++UI) { - const User *U = *UI; - + for (const User *U : GV->users()) if (const LoadInst *LI = dyn_cast<LoadInst>(U)) { SmallPtrSet<const PHINode*, 8> PHIs; if (!AllUsesOfValueWillTrapIfNull(LI, PHIs)) @@ -670,13 +667,12 @@ static bool AllUsesOfLoadedValueWillTrapIfNull(const GlobalVariable *GV) { //cerr << "UNKNOWN USER OF GLOBAL!: " << *U; return false; } - } return true; } static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) { bool Changed = false; - for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ) { + for (auto UI = V->user_begin(), E = V->user_end(); UI != E; ) { Instruction *I = cast<Instruction>(*UI++); if (LoadInst *LI = dyn_cast<LoadInst>(I)) { LI->setOperand(0, NewV); @@ -702,7 +698,7 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) { if (PassedAsArg) { // Being passed as an argument also. Be careful to not invalidate UI! - UI = V->use_begin(); + UI = V->user_begin(); } } } else if (CastInst *CI = dyn_cast<CastInst>(I)) { @@ -742,7 +738,7 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) { /// if the loaded value is dynamically null, then we know that they cannot be /// reachable with a null value, so we can optimize away the load. static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, - DataLayout *TD, + const DataLayout *DL, TargetLibraryInfo *TLI) { bool Changed = false; @@ -751,7 +747,7 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, bool AllNonStoreUsesGone = true; // Replace all uses of the loads with uses of the stored value. - for (Value::use_iterator GUI = GV->use_begin(), E = GV->use_end(); GUI != E;){ + for (Value::user_iterator GUI = GV->user_begin(), E = GV->user_end(); GUI != E;){ User *GlobalUser = *GUI++; if (LoadInst *LI = dyn_cast<LoadInst>(GlobalUser)) { Changed |= OptimizeAwayTrappingUsesOfValue(LI, LV); @@ -791,7 +787,7 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, Changed |= CleanupPointerRootUsers(GV, TLI); } else { Changed = true; - CleanupConstantGlobalUsers(GV, 0, TD, TLI); + CleanupConstantGlobalUsers(GV, nullptr, DL, TLI); } if (GV->use_empty()) { DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n"); @@ -805,11 +801,11 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, /// ConstantPropUsersOf - Walk the use list of V, constant folding all of the /// instructions that are foldable.
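The reasoning these trap predicates encode, shown as a hypothetical C++ source pattern (names invented): when every load of a module-private global is either dereferenced or compared against null, a dynamically null value would trap at the dereference anyway, so the pass may assume the single stored non-null value and fold the null checks away.

    #include <cstdlib>

    static int *Cache;   // internal-linkage global, stored to exactly once

    void init() { Cache = (int *)std::malloc(16 * sizeof(int)); }

    int get(unsigned i) {
      // The loaded pointer is always dereferenced; a null value would trap
      // here, so no well-defined execution observes Cache as null.
      return Cache[i];
    }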
-static void ConstantPropUsersOf(Value *V, - DataLayout *TD, TargetLibraryInfo *TLI) { - for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ) +static void ConstantPropUsersOf(Value *V, const DataLayout *DL, + TargetLibraryInfo *TLI) { + for (Value::user_iterator UI = V->user_begin(), E = V->user_end(); UI != E; ) if (Instruction *I = dyn_cast<Instruction>(*UI++)) - if (Constant *NewC = ConstantFoldInstruction(I, TD, TLI)) { + if (Constant *NewC = ConstantFoldInstruction(I, DL, TLI)) { I->replaceAllUsesWith(NewC); // Advance UI to the next non-I use to avoid invalidating it! @@ -829,7 +825,7 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, ConstantInt *NElements, - DataLayout *TD, + const DataLayout *DL, TargetLibraryInfo *TLI) { DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << " CALL = " << *CI << '\n'); @@ -853,9 +849,9 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, // If there are bitcast users of the malloc (which is typical, usually we have // a malloc + bitcast) then replace them with uses of the new global. Update // other users to use the global as well. - BitCastInst *TheBC = 0; + BitCastInst *TheBC = nullptr; while (!CI->use_empty()) { - Instruction *User = cast<Instruction>(CI->use_back()); + Instruction *User = cast<Instruction>(CI->user_back()); if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) { if (BCI->getType() == NewGV->getType()) { BCI->replaceAllUsesWith(NewGV); @@ -864,7 +860,7 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, BCI->setOperand(0, NewGV); } } else { - if (TheBC == 0) + if (!TheBC) TheBC = new BitCastInst(NewGV, CI->getType(), "newgv", CI); User->replaceUsesOfWith(CI, TheBC); } @@ -886,7 +882,7 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, // Loop over all uses of GV, processing them in turn. while (!GV->use_empty()) { - if (StoreInst *SI = dyn_cast<StoreInst>(GV->use_back())) { + if (StoreInst *SI = dyn_cast<StoreInst>(GV->user_back())) { // The global is initialized when the store to it occurs. new StoreInst(ConstantInt::getTrue(GV->getContext()), InitBool, false, 0, SI->getOrdering(), SI->getSynchScope(), SI); @@ -894,15 +890,15 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, continue; } - LoadInst *LI = cast<LoadInst>(GV->use_back()); + LoadInst *LI = cast<LoadInst>(GV->user_back()); while (!LI->use_empty()) { - Use &LoadUse = LI->use_begin().getUse(); - if (!isa<ICmpInst>(LoadUse.getUser())) { + Use &LoadUse = *LI->use_begin(); + ICmpInst *ICI = dyn_cast<ICmpInst>(LoadUse.getUser()); + if (!ICI) { LoadUse = RepValue; continue; } - ICmpInst *ICI = cast<ICmpInst>(LoadUse.getUser()); // Replace the cmp X, 0 with a use of the bool value. // Sink the load to where the compare was, if atomic rules allow us to. Value *LV = new LoadInst(InitBool, InitBool->getName()+".val", false, 0, @@ -936,7 +932,7 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, // If the initialization boolean was used, insert it, otherwise delete it. 
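For orientation, this is the rewrite OptimizeGlobalAddressOfMalloc performs, sketched as C++ rather than IR (all names invented): the malloc'd storage becomes an ordinary global, and null tests of the pointer become reads of the one-bit InitBool flag handled above.

    #include <cstdlib>

    // Before: null-ness of the global doubles as "has init run?".
    static int *GV;
    void initBefore() { GV = (int *)std::malloc(100 * sizeof(int)); }
    bool readyBefore() { return GV != nullptr; }

    // After (shape only): storage is a plain global, loads of GV become
    // direct references to GVBody, and the null test reads a boolean.
    static int GVBody[100];
    static bool GVInit = false;
    void initAfter() { GVInit = true; }
    bool readyAfter() { return GVInit; }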
if (!InitBoolUsed) { while (!InitBool->use_empty()) // Delete initializations - cast<StoreInst>(InitBool->use_back())->eraseFromParent(); + cast<StoreInst>(InitBool->user_back())->eraseFromParent(); delete InitBool; } else GV->getParent()->getGlobalList().insert(GV, InitBool); @@ -948,9 +944,9 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, // To further other optimizations, loop over all users of NewGV and try to // constant prop them. This will promote GEP instructions with constant // indices into GEP constant-exprs, which will allow global-opt to hack on it. - ConstantPropUsersOf(NewGV, TD, TLI); + ConstantPropUsersOf(NewGV, DL, TLI); if (RepValue != NewGV) - ConstantPropUsersOf(RepValue, TD, TLI); + ConstantPropUsersOf(RepValue, DL, TLI); return NewGV; } @@ -962,9 +958,8 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(const Instruction *V, const GlobalVariable *GV, SmallPtrSet<const PHINode*, 8> &PHIs) { - for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); - UI != E; ++UI) { - const Instruction *Inst = cast<Instruction>(*UI); + for (const User *U : V->users()) { + const Instruction *Inst = cast<Instruction>(U); if (isa<LoadInst>(Inst) || isa<CmpInst>(Inst)) { continue; // Fine, ignore. @@ -1011,7 +1006,7 @@ static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(const Instruction *V, static void ReplaceUsesOfMallocWithGlobal(Instruction *Alloc, GlobalVariable *GV) { while (!Alloc->use_empty()) { - Instruction *U = cast<Instruction>(*Alloc->use_begin()); + Instruction *U = cast<Instruction>(*Alloc->user_begin()); Instruction *InsertPt = U; if (StoreInst *SI = dyn_cast<StoreInst>(U)) { // If this is the store of the allocation into the global, remove it. @@ -1022,7 +1017,7 @@ static void ReplaceUsesOfMallocWithGlobal(Instruction *Alloc, } else if (PHINode *PN = dyn_cast<PHINode>(U)) { // Insert the load in the corresponding predecessor, not right before the // PHI. - InsertPt = PN->getIncomingBlock(Alloc->use_begin())->getTerminator(); + InsertPt = PN->getIncomingBlock(*Alloc->use_begin())->getTerminator(); } else if (isa<BitCastInst>(U)) { // Must be bitcast between the malloc and store to initialize the global. ReplaceUsesOfMallocWithGlobal(U, GV); @@ -1032,7 +1027,7 @@ static void ReplaceUsesOfMallocWithGlobal(Instruction *Alloc, // If this is a "GEP bitcast" and the user is a store to the global, then // just process it as a bitcast. if (GEPI->hasAllZeroIndices() && GEPI->hasOneUse()) - if (StoreInst *SI = dyn_cast<StoreInst>(GEPI->use_back())) + if (StoreInst *SI = dyn_cast<StoreInst>(GEPI->user_back())) if (SI->getOperand(1) == GV) { // Must be bitcast GEP between the malloc and store to initialize // the global. @@ -1056,19 +1051,18 @@ static bool LoadUsesSimpleEnoughForHeapSRA(const Value *V, SmallPtrSet<const PHINode*, 32> &LoadUsingPHIsPerLoad) { // We permit two users of the load: setcc comparing against the null // pointer, and a getelementptr of a specific form. - for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; - ++UI) { - const Instruction *User = cast<Instruction>(*UI); + for (const User *U : V->users()) { + const Instruction *UI = cast<Instruction>(U); // Comparison against null is ok. 
- if (const ICmpInst *ICI = dyn_cast<ICmpInst>(User)) { + if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UI)) { if (!isa<ConstantPointerNull>(ICI->getOperand(1))) return false; continue; } // getelementptr is also ok, but only a simple form. - if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) { + if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(UI)) { // Must index into the array and into the struct. if (GEPI->getNumOperands() < 3) return false; @@ -1077,7 +1071,7 @@ static bool LoadUsesSimpleEnoughForHeapSRA(const Value *V, continue; } - if (const PHINode *PN = dyn_cast<PHINode>(User)) { + if (const PHINode *PN = dyn_cast<PHINode>(UI)) { if (!LoadUsingPHIsPerLoad.insert(PN)) // This means some phi nodes are dependent on each other. // Avoid infinite looping! @@ -1108,9 +1102,8 @@ static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(const GlobalVariable *GV, Instruction *StoredVal) { SmallPtrSet<const PHINode*, 32> LoadUsingPHIs; SmallPtrSet<const PHINode*, 32> LoadUsingPHIsPerLoad; - for (Value::const_use_iterator UI = GV->use_begin(), E = GV->use_end(); - UI != E; ++UI) - if (const LoadInst *LI = dyn_cast<LoadInst>(*UI)) { + for (const User *U : GV->users()) + if (const LoadInst *LI = dyn_cast<LoadInst>(U)) { if (!LoadUsesSimpleEnoughForHeapSRA(LI, LoadUsingPHIs, LoadUsingPHIsPerLoad)) return false; @@ -1178,10 +1171,13 @@ static Value *GetHeapSROAValue(Value *V, unsigned FieldNo, } else if (PHINode *PN = dyn_cast<PHINode>(V)) { // PN's type is pointer to struct. Make a new PHI of pointer to struct // field. - StructType *ST = cast<StructType>(PN->getType()->getPointerElementType()); + PointerType *PTy = cast<PointerType>(PN->getType()); + StructType *ST = cast<StructType>(PTy->getElementType()); + + unsigned AS = PTy->getAddressSpace(); PHINode *NewPN = - PHINode::Create(PointerType::getUnqual(ST->getElementType(FieldNo)), + PHINode::Create(PointerType::get(ST->getElementType(FieldNo), AS), PN->getNumIncomingValues(), PN->getName()+".f"+Twine(FieldNo), PN); Result = NewPN; @@ -1249,7 +1245,7 @@ static void RewriteHeapSROALoadUser(Instruction *LoadUser, // If this is the first time we've seen this PHI, recursively process all // users. - for (Value::use_iterator UI = PN->use_begin(), E = PN->use_end(); UI != E; ) { + for (auto UI = PN->user_begin(), E = PN->user_end(); UI != E;) { Instruction *User = cast<Instruction>(*UI++); RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite); } @@ -1262,8 +1258,7 @@ static void RewriteHeapSROALoadUser(Instruction *LoadUser, static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load, DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues, std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) { - for (Value::use_iterator UI = Load->use_begin(), E = Load->use_end(); - UI != E; ) { + for (auto UI = Load->user_begin(), E = Load->user_end(); UI != E;) { Instruction *User = cast<Instruction>(*UI++); RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite); } @@ -1277,7 +1272,7 @@ static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load, /// PerformHeapAllocSRoA - CI is an allocation of an array of structures. Break /// it up into multiple allocations of arrays of the fields. 
static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, - Value *NElems, DataLayout *TD, + Value *NElems, const DataLayout *DL, const TargetLibraryInfo *TLI) { DEBUG(dbgs() << "SROA HEAP ALLOC: " << *GV << " MALLOC = " << *CI << '\n'); Type *MAT = getMallocAllocatedType(CI, TLI); @@ -1294,9 +1289,10 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, std::vector<Value*> FieldGlobals; std::vector<Value*> FieldMallocs; + unsigned AS = GV->getType()->getPointerAddressSpace(); for (unsigned FieldNo = 0, e = STy->getNumElements(); FieldNo != e;++FieldNo){ Type *FieldTy = STy->getElementType(FieldNo); - PointerType *PFieldTy = PointerType::getUnqual(FieldTy); + PointerType *PFieldTy = PointerType::get(FieldTy, AS); GlobalVariable *NGV = new GlobalVariable(*GV->getParent(), @@ -1306,13 +1302,13 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, GV->getThreadLocalMode()); FieldGlobals.push_back(NGV); - unsigned TypeSize = TD->getTypeAllocSize(FieldTy); + unsigned TypeSize = DL->getTypeAllocSize(FieldTy); if (StructType *ST = dyn_cast<StructType>(FieldTy)) - TypeSize = TD->getStructLayout(ST)->getSizeInBytes(); - Type *IntPtrTy = TD->getIntPtrType(CI->getType()); + TypeSize = DL->getStructLayout(ST)->getSizeInBytes(); + Type *IntPtrTy = DL->getIntPtrType(CI->getType()); Value *NMI = CallInst::CreateMalloc(CI, IntPtrTy, FieldTy, ConstantInt::get(IntPtrTy, TypeSize), - NElems, 0, + NElems, nullptr, CI->getName() + ".f" + Twine(FieldNo)); FieldMallocs.push_back(NMI); new StoreInst(NMI, NGV, CI); @@ -1394,7 +1390,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, // Okay, the malloc site is completely handled. All of the uses of GV are now // loads, and all uses of those loads are simple. Rewrite them to use loads // of the per-field globals instead. - for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end(); UI != E;) { + for (auto UI = GV->user_begin(), E = GV->user_end(); UI != E;) { Instruction *User = cast<Instruction>(*UI++); if (LoadInst *LI = dyn_cast<LoadInst>(User)) { @@ -1469,9 +1465,9 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, Type *AllocTy, AtomicOrdering Ordering, Module::global_iterator &GVI, - DataLayout *TD, + const DataLayout *DL, TargetLibraryInfo *TLI) { - if (!TD) + if (!DL) return false; // If this is a malloc of an abstract type, don't touch it. @@ -1501,7 +1497,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, // This eliminates dynamic allocation, avoids an indirection accessing the // data, and exposes the resultant global to further GlobalOpt. // We cannot optimize the malloc if we cannot determine malloc array size. - Value *NElems = getMallocArraySize(CI, TD, TLI, true); + Value *NElems = getMallocArraySize(CI, DL, TLI, true); if (!NElems) return false; @@ -1509,8 +1505,8 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, // Restrict this transformation to only working on small allocations // (2048 bytes currently), as we don't want to introduce a 16M global or // something. 
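The 2048-byte gate quoted above is a straight product check; restated as a hypothetical helper with invented names:

    #include <cstdint>

    // Only fold a malloc into a global when the whole allocation is small;
    // the threshold matches the comment above.
    static bool worthPromotingMallocToGlobal(uint64_t NElements,
                                             uint64_t EltAllocSize) {
      return NElements * EltAllocSize < 2048;
    }
    // e.g. 100 x i32 (400 bytes) passes; 1024 x double (8192 bytes) does not.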
- if (NElements->getZExtValue() * TD->getTypeAllocSize(AllocTy) < 2048) { - GVI = OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, TD, TLI); + if (NElements->getZExtValue() * DL->getTypeAllocSize(AllocTy) < 2048) { + GVI = OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, DL, TLI); return true; } @@ -1539,13 +1535,13 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, // If this is a fixed size array, transform the Malloc to be an alloc of // structs. malloc [100 x struct],1 -> malloc struct, 100 if (ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI, TLI))) { - Type *IntPtrTy = TD->getIntPtrType(CI->getType()); - unsigned TypeSize = TD->getStructLayout(AllocSTy)->getSizeInBytes(); + Type *IntPtrTy = DL->getIntPtrType(CI->getType()); + unsigned TypeSize = DL->getStructLayout(AllocSTy)->getSizeInBytes(); Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize); Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements()); Instruction *Malloc = CallInst::CreateMalloc(CI, IntPtrTy, AllocSTy, AllocSize, NumElements, - 0, CI->getName()); + nullptr, CI->getName()); Instruction *Cast = new BitCastInst(Malloc, CI->getType(), "tmp", CI); CI->replaceAllUsesWith(Cast); CI->eraseFromParent(); @@ -1555,8 +1551,8 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CI = cast<CallInst>(Malloc); } - GVI = PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, TD, TLI, true), - TD, TLI); + GVI = PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, DL, TLI, true), + DL, TLI); return true; } @@ -1568,7 +1564,8 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, AtomicOrdering Ordering, Module::global_iterator &GVI, - DataLayout *TD, TargetLibraryInfo *TLI) { + const DataLayout *DL, + TargetLibraryInfo *TLI) { // Ignore no-op GEPs and bitcasts. StoredOnceVal = StoredOnceVal->stripPointerCasts(); @@ -1583,13 +1580,13 @@ static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, SOVC = ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType()); // Optimize away any trapping uses of the loaded value. - if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, TD, TLI)) + if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, DL, TLI)) return true; } else if (CallInst *CI = extractMallocCall(StoredOnceVal, TLI)) { Type *MallocType = getMallocAllocatedType(CI, TLI); if (MallocType && TryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType, Ordering, GVI, - TD, TLI)) + DL, TLI)) return true; } } @@ -1616,11 +1613,9 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { // Walk the use list of the global seeing if all the uses are load or store. // If there is anything else, bail out. - for (Value::use_iterator I = GV->use_begin(), E = GV->use_end(); I != E; ++I){ - User *U = *I; + for (User *U : GV->users()) if (!isa<LoadInst>(U) && !isa<StoreInst>(U)) return false; - } DEBUG(dbgs() << " *** SHRINKING TO BOOL: " << *GV); @@ -1645,7 +1640,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { IsOneZero = InitVal->isNullValue() && CI->isOne(); while (!GV->use_empty()) { - Instruction *UI = cast<Instruction>(GV->use_back()); + Instruction *UI = cast<Instruction>(GV->user_back()); if (StoreInst *SI = dyn_cast<StoreInst>(UI)) { // Change the store into a boolean store. 
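TryToShrinkGlobalToBoolean, whose store rewrite continues below, fires when only two values ever reach a global: its initializer and one other constant. A hypothetical before/after picture at the C++ level (the pass itself works on IR):

    // Before: only two values ever reach Flag, its initializer 0 and 42.
    static int Flag = 0;
    void set() { Flag = 42; }
    int get() { return Flag; }

    // After (shape only): an i1 records which of the two values is live,
    // and each load rematerializes the corresponding constant.
    static bool FlagB = false;
    void setB() { FlagB = true; }
    int getB() { return FlagB ? 42 : 0; }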
bool StoringOther = SI->getOperand(0) == OtherVal; @@ -1705,9 +1700,6 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { /// possible. If we make a change, return true. bool GlobalOpt::ProcessGlobal(GlobalVariable *GV, Module::global_iterator &GVI) { - if (!GV->isDiscardableIfUnused()) - return false; - // Do more involved optimizations if the global is internal. GV->removeDeadConstantUsers(); @@ -1746,7 +1738,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, // and this function is main (which we know is not recursive), we replace // the global with a local alloca in this function. // - // NOTE: It doesn't make sense to promote non single-value types since we + // NOTE: It doesn't make sense to promote non-single-value types since we // are just replacing static memory to stack memory. // // If the global is in different address space, don't bring it to stack. @@ -1761,7 +1753,8 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, ->getEntryBlock().begin()); Type *ElemTy = GV->getType()->getElementType(); // FIXME: Pass Global's alignment when globals have alignment - AllocaInst *Alloca = new AllocaInst(ElemTy, NULL, GV->getName(), &FirstI); + AllocaInst *Alloca = new AllocaInst(ElemTy, nullptr, + GV->getName(), &FirstI); if (!isa<UndefValue>(GV->getInitializer())) new StoreInst(GV->getInitializer(), Alloca, &FirstI); @@ -1783,7 +1776,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, } else { // Delete any stores we can find to the global. We may not be able to // make it completely dead though. - Changed = CleanupConstantGlobalUsers(GV, GV->getInitializer(), TD, TLI); + Changed = CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, TLI); } // If the global is dead now, delete it. @@ -1799,7 +1792,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, GV->setConstant(true); // Clean up any obviously simplifiable users now. - CleanupConstantGlobalUsers(GV, GV->getInitializer(), TD, TLI); + CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, TLI); // If the global is dead now, just nuke it. if (GV->use_empty()) { @@ -1812,11 +1805,13 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, ++NumMarked; return true; } else if (!GV->getInitializer()->getType()->isSingleValueType()) { - if (DataLayout *TD = getAnalysisIfAvailable<DataLayout>()) - if (GlobalVariable *FirstNewGV = SRAGlobal(GV, *TD)) { + if (DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>()) { + const DataLayout &DL = DLP->getDataLayout(); + if (GlobalVariable *FirstNewGV = SRAGlobal(GV, DL)) { GVI = FirstNewGV; // Don't skip the newly produced globals! return true; } + } } else if (GS.StoredType == GlobalStatus::StoredOnce) { // If the initial value for the global was an undef value, and if only // one other value was stored into it, we can just change the @@ -1828,7 +1823,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, GV->setInitializer(SOVConstant); // Clean up any obviously simplifiable users now. - CleanupConstantGlobalUsers(GV, GV->getInitializer(), TD, TLI); + CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, TLI); if (GV->use_empty()) { DEBUG(dbgs() << " *** Substituting initializer allowed us to " @@ -1845,7 +1840,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, // Try to optimize globals based on the knowledge that only one value // (besides its initializer) is ever stored to the global. 
if (OptimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, GVI, - TD, TLI)) + DL, TLI)) return true; // Otherwise, if the global was not a boolean, we can shrink it to be a @@ -1866,11 +1861,11 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, /// ChangeCalleesToFastCall - Walk all of the direct calls of the specified /// function, changing them to FastCC. static void ChangeCalleesToFastCall(Function *F) { - for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); UI != E;++UI){ - if (isa<BlockAddress>(*UI)) + for (User *U : F->users()) { + if (isa<BlockAddress>(U)) continue; - CallSite User(cast<Instruction>(*UI)); - User.setCallingConv(CallingConv::Fast); + CallSite CS(cast<Instruction>(U)); + CS.setCallingConv(CallingConv::Fast); } } @@ -1889,21 +1884,31 @@ static AttributeSet StripNest(LLVMContext &C, const AttributeSet &Attrs) { static void RemoveNestAttribute(Function *F) { F->setAttributes(StripNest(F->getContext(), F->getAttributes())); - for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); UI != E;++UI){ - if (isa<BlockAddress>(*UI)) + for (User *U : F->users()) { + if (isa<BlockAddress>(U)) continue; - CallSite User(cast<Instruction>(*UI)); - User.setAttributes(StripNest(F->getContext(), User.getAttributes())); + CallSite CS(cast<Instruction>(U)); + CS.setAttributes(StripNest(F->getContext(), CS.getAttributes())); } } +/// Return true if this is a calling convention that we'd like to change. The +/// idea here is that we don't want to mess with the convention if the user +/// explicitly requested something with performance implications like coldcc, +/// GHC, or anyregcc. +static bool isProfitableToMakeFastCC(Function *F) { + CallingConv::ID CC = F->getCallingConv(); + // FIXME: Is it worth transforming x86_stdcallcc and x86_fastcallcc? + return CC == CallingConv::C || CC == CallingConv::X86_ThisCall; +} + bool GlobalOpt::OptimizeFunctions(Module &M) { bool Changed = false; // Optimize functions. for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) { Function *F = FI++; // Functions without names cannot be referenced outside this module. - if (!F->hasName() && !F->isDeclaration()) + if (!F->hasName() && !F->isDeclaration() && !F->hasLocalLinkage()) F->setLinkage(GlobalValue::InternalLinkage); F->removeDeadConstantUsers(); if (F->isDefTriviallyDead()) { @@ -1911,11 +1916,11 @@ bool GlobalOpt::OptimizeFunctions(Module &M) { Changed = true; ++NumFnDeleted; } else if (F->hasLocalLinkage()) { - if (F->getCallingConv() == CallingConv::C && !F->isVarArg() && + if (isProfitableToMakeFastCC(F) && !F->isVarArg() && !F->hasAddressTaken()) { - // If this function has C calling conventions, is not a varargs - // function, and is only called directly, promote it to use the Fast - // calling convention. + // If this function has a calling convention worth changing, is not a + // varargs function, and is only called directly, promote it to use the + // Fast calling convention. 
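The new isProfitableToMakeFastCC gate combines with two pre-existing conditions visible above. The full test, condensed into a hypothetical helper (not the pass's own code):

    #include "llvm/IR/Function.h"
    using namespace llvm;

    // An internal, non-varargs function whose address is never taken has
    // only direct, module-local call sites, so both the callee and every
    // caller can be rewritten to the Fast calling convention together.
    static bool canSwitchToFastCC(const Function &F) {
      CallingConv::ID CC = F.getCallingConv();
      return (CC == CallingConv::C || CC == CallingConv::X86_ThisCall) &&
             F.hasLocalLinkage() && !F.isVarArg() && !F.hasAddressTaken();
    }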
F->setCallingConv(CallingConv::Fast); ChangeCalleesToFastCall(F); ++NumFastCallFns; @@ -1937,139 +1942,41 @@ bool GlobalOpt::OptimizeFunctions(Module &M) { bool GlobalOpt::OptimizeGlobalVars(Module &M) { bool Changed = false; + + SmallSet<const Comdat *, 8> NotDiscardableComdats; + for (const GlobalVariable &GV : M.globals()) + if (const Comdat *C = GV.getComdat()) + if (!GV.isDiscardableIfUnused()) + NotDiscardableComdats.insert(C); + for (Module::global_iterator GVI = M.global_begin(), E = M.global_end(); GVI != E; ) { GlobalVariable *GV = GVI++; // Global variables without names cannot be referenced outside this module. - if (!GV->hasName() && !GV->isDeclaration()) + if (!GV->hasName() && !GV->isDeclaration() && !GV->hasLocalLinkage()) GV->setLinkage(GlobalValue::InternalLinkage); // Simplify the initializer. if (GV->hasInitializer()) if (ConstantExpr *CE = dyn_cast<ConstantExpr>(GV->getInitializer())) { - Constant *New = ConstantFoldConstantExpression(CE, TD, TLI); + Constant *New = ConstantFoldConstantExpression(CE, DL, TLI); if (New && New != CE) GV->setInitializer(New); } - Changed |= ProcessGlobal(GV, GVI); - } - return Changed; -} - -/// FindGlobalCtors - Find the llvm.global_ctors list, verifying that all -/// initializers have an init priority of 65535. -GlobalVariable *GlobalOpt::FindGlobalCtors(Module &M) { - GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors"); - if (GV == 0) return 0; - - // Verify that the initializer is simple enough for us to handle. We are - // only allowed to optimize the initializer if it is unique. - if (!GV->hasUniqueInitializer()) return 0; - - if (isa<ConstantAggregateZero>(GV->getInitializer())) - return GV; - ConstantArray *CA = cast<ConstantArray>(GV->getInitializer()); - - for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i) { - if (isa<ConstantAggregateZero>(*i)) - continue; - ConstantStruct *CS = cast<ConstantStruct>(*i); - if (isa<ConstantPointerNull>(CS->getOperand(1))) - continue; - - // Must have a function or null ptr. - if (!isa<Function>(CS->getOperand(1))) - return 0; - - // Init priority must be standard. - ConstantInt *CI = cast<ConstantInt>(CS->getOperand(0)); - if (CI->getZExtValue() != 65535) - return 0; - } - - return GV; -} - -/// ParseGlobalCtors - Given a llvm.global_ctors list that we can understand, -/// return a list of the functions and null terminator as a vector. -static std::vector<Function*> ParseGlobalCtors(GlobalVariable *GV) { - if (GV->getInitializer()->isNullValue()) - return std::vector<Function*>(); - ConstantArray *CA = cast<ConstantArray>(GV->getInitializer()); - std::vector<Function*> Result; - Result.reserve(CA->getNumOperands()); - for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i) { - ConstantStruct *CS = cast<ConstantStruct>(*i); - Result.push_back(dyn_cast<Function>(CS->getOperand(1))); - } - return Result; -} - -/// InstallGlobalCtors - Given a specified llvm.global_ctors list, install the -/// specified array, returning the new global to use. -static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL, - const std::vector<Function*> &Ctors) { - // If we made a change, reassemble the initializer list. - Constant *CSVals[2]; - CSVals[0] = ConstantInt::get(Type::getInt32Ty(GCL->getContext()), 65535); - CSVals[1] = 0; - - StructType *StructTy = - cast<StructType>(GCL->getType()->getElementType()->getArrayElementType()); - - // Create the new init list. 
- std::vector<Constant*> CAList; - for (unsigned i = 0, e = Ctors.size(); i != e; ++i) { - if (Ctors[i]) { - CSVals[1] = Ctors[i]; - } else { - Type *FTy = FunctionType::get(Type::getVoidTy(GCL->getContext()), - false); - PointerType *PFTy = PointerType::getUnqual(FTy); - CSVals[1] = Constant::getNullValue(PFTy); - CSVals[0] = ConstantInt::get(Type::getInt32Ty(GCL->getContext()), - 0x7fffffff); + if (GV->isDiscardableIfUnused()) { + if (const Comdat *C = GV->getComdat()) + if (NotDiscardableComdats.count(C)) + continue; + Changed |= ProcessGlobal(GV, GVI); } - CAList.push_back(ConstantStruct::get(StructTy, CSVals)); - } - - // Create the array initializer. - Constant *CA = ConstantArray::get(ArrayType::get(StructTy, - CAList.size()), CAList); - - // If we didn't change the number of elements, don't create a new GV. - if (CA->getType() == GCL->getInitializer()->getType()) { - GCL->setInitializer(CA); - return GCL; - } - - // Create the new global and insert it next to the existing list. - GlobalVariable *NGV = new GlobalVariable(CA->getType(), GCL->isConstant(), - GCL->getLinkage(), CA, "", - GCL->getThreadLocalMode()); - GCL->getParent()->getGlobalList().insert(GCL, NGV); - NGV->takeName(GCL); - - // Nuke the old list, replacing any uses with the new one. - if (!GCL->use_empty()) { - Constant *V = NGV; - if (V->getType() != GCL->getType()) - V = ConstantExpr::getBitCast(V, GCL->getType()); - GCL->replaceAllUsesWith(V); } - GCL->eraseFromParent(); - - if (Ctors.size()) - return NGV; - else - return 0; + return Changed; } - static inline bool isSimpleEnoughValueToCommit(Constant *C, SmallPtrSet<Constant*, 8> &SimpleConstants, - const DataLayout *TD); + const DataLayout *DL); /// isSimpleEnoughValueToCommit - Return true if the specified constant can be @@ -2082,11 +1989,14 @@ isSimpleEnoughValueToCommit(Constant *C, /// time. static bool isSimpleEnoughValueToCommitHelper(Constant *C, SmallPtrSet<Constant*, 8> &SimpleConstants, - const DataLayout *TD) { - // Simple integer, undef, constant aggregate zero, global addresses, etc are - // all supported. - if (C->getNumOperands() == 0 || isa<BlockAddress>(C) || - isa<GlobalValue>(C)) + const DataLayout *DL) { + // Simple global addresses are supported, do not allow dllimport or + // thread-local globals. + if (auto *GV = dyn_cast<GlobalValue>(C)) + return !GV->hasDLLImportStorageClass() && !GV->isThreadLocal(); + + // Simple integer, undef, constant aggregate zero, etc are all supported. + if (C->getNumOperands() == 0 || isa<BlockAddress>(C)) return true; // Aggregate values are safe if all their elements are. @@ -2094,7 +2004,7 @@ static bool isSimpleEnoughValueToCommitHelper(Constant *C, isa<ConstantVector>(C)) { for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) { Constant *Op = cast<Constant>(C->getOperand(i)); - if (!isSimpleEnoughValueToCommit(Op, SimpleConstants, TD)) + if (!isSimpleEnoughValueToCommit(Op, SimpleConstants, DL)) return false; } return true; @@ -2107,29 +2017,29 @@ static bool isSimpleEnoughValueToCommitHelper(Constant *C, switch (CE->getOpcode()) { case Instruction::BitCast: // Bitcast is fine if the casted value is fine. - return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, TD); + return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL); case Instruction::IntToPtr: case Instruction::PtrToInt: // int <=> ptr is fine if the int type is the same size as the // pointer type. 
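Why the size check above matters: a ptrtoint/inttoptr round-trip only preserves the link-time address when the integer is exactly pointer-sized; a narrower type truncates. A hypothetical 64-bit illustration with plain integers standing in for pointers:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t Addr = 0x00007fffdeadbeefULL; // stand-in for a global's address
      uint32_t Narrow = (uint32_t)Addr;      // "ptrtoint to i32": top bits lost
      assert((uint64_t)Narrow != Addr);      // round-trip is not the identity
      uint64_t Wide = Addr;                  // "ptrtoint to i64": lossless
      assert(Wide == Addr);
      return 0;
    }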
- if (!TD || TD->getTypeSizeInBits(CE->getType()) != - TD->getTypeSizeInBits(CE->getOperand(0)->getType())) + if (!DL || DL->getTypeSizeInBits(CE->getType()) != + DL->getTypeSizeInBits(CE->getOperand(0)->getType())) return false; - return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, TD); + return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL); // GEP is fine if it is simple + constant offset. case Instruction::GetElementPtr: for (unsigned i = 1, e = CE->getNumOperands(); i != e; ++i) if (!isa<ConstantInt>(CE->getOperand(i))) return false; - return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, TD); + return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL); case Instruction::Add: // We allow simple+cst. if (!isa<ConstantInt>(CE->getOperand(1))) return false; - return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, TD); + return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL); } return false; } @@ -2137,11 +2047,11 @@ static bool isSimpleEnoughValueToCommitHelper(Constant *C, static inline bool isSimpleEnoughValueToCommit(Constant *C, SmallPtrSet<Constant*, 8> &SimpleConstants, - const DataLayout *TD) { + const DataLayout *DL) { // If we already checked this constant, we win. if (!SimpleConstants.insert(C)) return true; // Check the constant. - return isSimpleEnoughValueToCommitHelper(C, SimpleConstants, TD); + return isSimpleEnoughValueToCommitHelper(C, SimpleConstants, DL); } @@ -2157,8 +2067,7 @@ static bool isSimpleEnoughPointerToCommit(Constant *C) { return false; if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) - // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or - // external globals. + // Do not allow weak/*_odr/linkonce linkage or external globals. return GV->hasUniqueInitializer(); if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) { @@ -2173,7 +2082,7 @@ static bool isSimpleEnoughPointerToCommit(Constant *C) { return false; // The first index must be zero. - ConstantInt *CI = dyn_cast<ConstantInt>(*llvm::next(CE->op_begin())); + ConstantInt *CI = dyn_cast<ConstantInt>(*std::next(CE->op_begin())); if (!CI || !CI->isZero()) return false; // The remaining indices must be compile-time known integers within the @@ -2268,24 +2177,18 @@ namespace { /// Once an evaluation call fails, the evaluation object should not be reused. class Evaluator { public: - Evaluator(const DataLayout *TD, const TargetLibraryInfo *TLI) - : TD(TD), TLI(TLI) { - ValueStack.push_back(new DenseMap<Value*, Constant*>); + Evaluator(const DataLayout *DL, const TargetLibraryInfo *TLI) + : DL(DL), TLI(TLI) { + ValueStack.emplace_back(); } ~Evaluator() { - DeleteContainerPointers(ValueStack); - while (!AllocaTmps.empty()) { - GlobalVariable *Tmp = AllocaTmps.back(); - AllocaTmps.pop_back(); - + for (auto &Tmp : AllocaTmps) // If there are still users of the alloca, the program is doing something // silly, e.g. storing the address of the alloca somewhere and using it // later. Since this is undefined, we'll just make it be null. 
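The Evaluator changes above drop a vector of manually new'd per-frame maps in favor of a std::deque of maps held by value: the deque removes the hand-written delete loop and, unlike std::vector, keeps references to existing frames valid when a nested call pushes a new one. A minimal sketch of the same ownership pattern using only standard containers:

    #include <deque>
    #include <map>

    struct MiniEvaluator {         // illustrative stand-in, not the real class
      // One map per call frame; back() is the frame being executed.
      std::deque<std::map<int, int>> ValueStack;

      void enterFrame() { ValueStack.emplace_back(); }
      void leaveFrame() { ValueStack.pop_back(); }   // no manual delete
      void setVal(int V, int C) { ValueStack.back()[V] = C; }
      int getVal(int V) const { return ValueStack.back().at(V); }
    };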
if (!Tmp->use_empty()) Tmp->replaceAllUsesWith(Constant::getNullValue(Tmp->getType())); - delete Tmp; - } } /// EvaluateFunction - Evaluate a call to function F, returning true if @@ -2301,13 +2204,13 @@ public: Constant *getVal(Value *V) { if (Constant *CV = dyn_cast<Constant>(V)) return CV; - Constant *R = ValueStack.back()->lookup(V); + Constant *R = ValueStack.back().lookup(V); assert(R && "Reference to an uncomputed value!"); return R; } void setVal(Value *V, Constant *C) { - ValueStack.back()->operator[](V) = C; + ValueStack.back()[V] = C; } const DenseMap<Constant*, Constant*> &getMutatedMemory() const { @@ -2322,9 +2225,9 @@ private: Constant *ComputeLoadResult(Constant *P); /// ValueStack - As we compute SSA register values, we store their contents - /// here. The back of the vector contains the current function and the stack + /// here. The back of the deque contains the current function and the stack /// contains the values in the calling frames. - SmallVector<DenseMap<Value*, Constant*>*, 4> ValueStack; + std::deque<DenseMap<Value*, Constant*>> ValueStack; /// CallStack - This is used to detect recursion. In pathological situations /// we could hit exponential behavior, but at least there is nothing @@ -2339,7 +2242,7 @@ private: /// AllocaTmps - To 'execute' an alloca, we create a temporary global variable /// to represent its body. This vector is needed so we can delete the /// temporary globals when we are done. - SmallVector<GlobalVariable*, 32> AllocaTmps; + SmallVector<std::unique_ptr<GlobalVariable>, 32> AllocaTmps; /// Invariants - These global variables have been marked invariant by the /// static constructor. @@ -2349,7 +2252,7 @@ private: /// simple enough to live in a static initializer of a global. SmallPtrSet<Constant*, 8> SimpleConstants; - const DataLayout *TD; + const DataLayout *DL; const TargetLibraryInfo *TLI; }; @@ -2368,7 +2271,7 @@ Constant *Evaluator::ComputeLoadResult(Constant *P) { if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) { if (GV->hasDefinitiveInitializer()) return GV->getInitializer(); - return 0; + return nullptr; } // Handle a constantexpr getelementptr. @@ -2380,7 +2283,7 @@ Constant *Evaluator::ComputeLoadResult(Constant *P) { return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE); } - return 0; // don't know how to evaluate. + return nullptr; // don't know how to evaluate. } /// EvaluateBlock - Evaluate all instructions in block BB, returning true if @@ -2390,7 +2293,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB) { // This is the main evaluation loop. while (1) { - Constant *InstResult = 0; + Constant *InstResult = nullptr; DEBUG(dbgs() << "Evaluating Instruction: " << *CurInst << "\n"); @@ -2402,7 +2305,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, Constant *Ptr = getVal(SI->getOperand(1)); if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) { DEBUG(dbgs() << "Folding constant ptr expression: " << *Ptr); - Ptr = ConstantFoldConstantExpression(CE, TD, TLI); + Ptr = ConstantFoldConstantExpression(CE, DL, TLI); DEBUG(dbgs() << "; To: " << *Ptr << "\n"); } if (!isSimpleEnoughPointerToCommit(Ptr)) { @@ -2415,7 +2318,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, // If this might be too difficult for the backend to handle (e.g. the addr // of one global variable divided by another) then we can't commit it. 
- if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, TD)) { + if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, DL)) { DEBUG(dbgs() << "Store value is too complex to evaluate store. " << *Val << "\n"); return false; @@ -2447,7 +2350,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, Ptr = ConstantExpr::getGetElementPtr(Ptr, IdxList); if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) - Ptr = ConstantFoldConstantExpression(CE, TD, TLI); + Ptr = ConstantFoldConstantExpression(CE, DL, TLI); // If we can't improve the situation by introspecting NewTy, // we have to give up. @@ -2511,12 +2414,12 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, Constant *Ptr = getVal(LI->getOperand(0)); if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) { - Ptr = ConstantFoldConstantExpression(CE, TD, TLI); + Ptr = ConstantFoldConstantExpression(CE, DL, TLI); DEBUG(dbgs() << "Found a constant pointer expression, constant " "folding: " << *Ptr << "\n"); } InstResult = ComputeLoadResult(Ptr); - if (InstResult == 0) { + if (!InstResult) { DEBUG(dbgs() << "Failed to compute load result. Can not evaluate load." "\n"); return false; // Could not evaluate load. @@ -2529,11 +2432,10 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, return false; // Cannot handle array allocs. } Type *Ty = AI->getType()->getElementType(); - AllocaTmps.push_back(new GlobalVariable(Ty, false, - GlobalValue::InternalLinkage, - UndefValue::get(Ty), - AI->getName())); - InstResult = AllocaTmps.back(); + AllocaTmps.push_back( + make_unique<GlobalVariable>(Ty, false, GlobalValue::InternalLinkage, + UndefValue::get(Ty), AI->getName())); + InstResult = AllocaTmps.back().get(); DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n"); } else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) { CallSite CS(CurInst); @@ -2580,7 +2482,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, // We don't insert an entry into Values, as it doesn't have a // meaningful return value. if (!II->use_empty()) { - DEBUG(dbgs() << "Found unused invariant_start. Cant evaluate.\n"); + DEBUG(dbgs() << "Found unused invariant_start. Can't evaluate.\n"); return false; } ConstantInt *Size = cast<ConstantInt>(II->getArgOperand(0)); @@ -2588,9 +2490,9 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, Value *Ptr = PtrArg->stripPointerCasts(); if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) { Type *ElemTy = cast<PointerType>(GV->getType())->getElementType(); - if (TD && !Size->isAllOnesValue() && + if (DL && !Size->isAllOnesValue() && Size->getValue().getLimitedValue() >= - TD->getTypeStoreSize(ElemTy)) { + DL->getTypeStoreSize(ElemTy)) { Invariants.insert(GV); DEBUG(dbgs() << "Found a global var that is an invariant: " << *GV << "\n"); @@ -2635,17 +2537,17 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, return false; } - Constant *RetVal = 0; + Constant *RetVal = nullptr; // Execute the call, if successful, use the return value. - ValueStack.push_back(new DenseMap<Value*, Constant*>); + ValueStack.emplace_back(); if (!EvaluateFunction(Callee, RetVal, Formals)) { DEBUG(dbgs() << "Failed to evaluate function.\n"); return false; } - delete ValueStack.pop_back_val(); + ValueStack.pop_back(); InstResult = RetVal; - if (InstResult != NULL) { + if (InstResult) { DEBUG(dbgs() << "Successfully evaluated function. 
Result: " << InstResult << "\n\n"); } else { @@ -2677,7 +2579,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, else return false; // Cannot determine. } else if (isa<ReturnInst>(CurInst)) { - NextBB = 0; + NextBB = nullptr; } else { // invoke, unwind, resume, unreachable. DEBUG(dbgs() << "Can not handle terminator."); return false; } @@ -2696,7 +2598,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, if (!CurInst->use_empty()) { if (ConstantExpr *CE = dyn_cast<ConstantExpr>(InstResult)) - InstResult = ConstantFoldConstantExpression(CE, TD, TLI); + InstResult = ConstantFoldConstantExpression(CE, DL, TLI); setVal(CurInst, InstResult); } @@ -2742,13 +2644,13 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal, BasicBlock::iterator CurInst = CurBB->begin(); while (1) { - BasicBlock *NextBB = 0; // Initialized to avoid compiler warnings. + BasicBlock *NextBB = nullptr; // Initialized to avoid compiler warnings. DEBUG(dbgs() << "Trying to evaluate BB: " << *CurBB << "\n"); if (!EvaluateBlock(CurInst, NextBB)) return false; - if (NextBB == 0) { + if (!NextBB) { // Successfully running until there's no next block means that we found // the return. Fill in the return value and pop the call stack. ReturnInst *RI = cast<ReturnInst>(CurBB->getTerminator()); @@ -2767,7 +2669,7 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal, // Okay, we have never been in this block before. Check to see if there // are any PHI nodes. If so, evaluate them with information about where // we came from. - PHINode *PN = 0; + PHINode *PN = nullptr; for (CurInst = NextBB->begin(); (PN = dyn_cast<PHINode>(CurInst)); ++CurInst) setVal(PN, getVal(PN->getIncomingValueForBlock(CurBB))); @@ -2779,15 +2681,17 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal, /// EvaluateStaticConstructor - Evaluate static constructors in the function, if /// we can. Return true if we can, false otherwise. -static bool EvaluateStaticConstructor(Function *F, const DataLayout *TD, +static bool EvaluateStaticConstructor(Function *F, const DataLayout *DL, const TargetLibraryInfo *TLI) { // Call the function. - Evaluator Eval(TD, TLI); + Evaluator Eval(DL, TLI); Constant *RetValDummy; bool EvalSuccess = Eval.EvaluateFunction(F, RetValDummy, SmallVector<Constant*, 0>()); if (EvalSuccess) { + ++NumCtorsEvaluated; + // We succeeded at evaluation: commit the result. DEBUG(dbgs() << "FULLY EVALUATED GLOBAL CTOR FUNCTION '" << F->getName() << "' to " << Eval.getMutatedMemory().size() @@ -2805,46 +2709,6 @@ static bool EvaluateStaticConstructor(Function *F, const DataLayout *TD, return EvalSuccess; } -/// OptimizeGlobalCtorsList - Simplify and evaluation global ctors if possible. -/// Return true if anything changed. -bool GlobalOpt::OptimizeGlobalCtorsList(GlobalVariable *&GCL) { - std::vector<Function*> Ctors = ParseGlobalCtors(GCL); - bool MadeChange = false; - if (Ctors.empty()) return false; - - // Loop over global ctors, optimizing them when we can. - for (unsigned i = 0; i != Ctors.size(); ++i) { - Function *F = Ctors[i]; - // Found a null terminator in the middle of the list, prune off the rest of - // the list. - if (F == 0) { - if (i != Ctors.size()-1) { - Ctors.resize(i+1); - MadeChange = true; - } - break; - } - DEBUG(dbgs() << "Optimizing Global Constructor: " << *F << "\n"); - - // We cannot simplify external ctor functions. - if (F->empty()) continue; - - // If we can evaluate the ctor at compile time, do.
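A hypothetical translation unit whose constructor EvaluateStaticConstructor can fold completely: the dynamic initializer below writes only compile-time-computable values into module-local globals, so the mutated bytes are committed into the initializers and the entry is dropped from llvm.global_ctors.

    static int Table[4];

    static int fill() {
      for (int i = 0; i != 4; ++i)
        Table[i] = i * i;          // every store targets known memory
      return 0;
    }
    static int Dummy = fill();     // runs as a static constructor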
- if (EvaluateStaticConstructor(F, TD, TLI)) { - Ctors.erase(Ctors.begin()+i); - MadeChange = true; - --i; - ++NumCtorsEvaluated; - continue; - } - } - - if (!MadeChange) return false; - - GCL = InstallGlobalCtors(GCL, Ctors); - return true; -} - static int compareNames(Constant *const *A, Constant *const *B) { return (*A)->getName().compare((*B)->getName()); } @@ -2856,12 +2720,14 @@ static void setUsedInitializer(GlobalVariable &V, return; } - SmallVector<llvm::Constant *, 8> UsedArray; - PointerType *Int8PtrTy = Type::getInt8PtrTy(V.getContext()); + // Type of pointer to the array of pointers. + PointerType *Int8PtrTy = Type::getInt8PtrTy(V.getContext(), 0); + SmallVector<llvm::Constant *, 8> UsedArray; for (SmallPtrSet<GlobalValue *, 8>::iterator I = Init.begin(), E = Init.end(); I != E; ++I) { - Constant *Cast = llvm::ConstantExpr::getBitCast(*I, Int8PtrTy); + Constant *Cast + = ConstantExpr::getPointerBitCastOrAddrSpaceCast(*I, Int8PtrTy); UsedArray.push_back(Cast); } // Sort to get deterministic order. @@ -2992,14 +2858,19 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) { I != E;) { Module::alias_iterator J = I++; // Aliases without names cannot be referenced outside this module. - if (!J->hasName() && !J->isDeclaration()) + if (!J->hasName() && !J->isDeclaration() && !J->hasLocalLinkage()) J->setLinkage(GlobalValue::InternalLinkage); // If the aliasee may change at link time, nothing can be done - bail out. if (J->mayBeOverridden()) continue; Constant *Aliasee = J->getAliasee(); - GlobalValue *Target = cast<GlobalValue>(Aliasee->stripPointerCasts()); + GlobalValue *Target = dyn_cast<GlobalValue>(Aliasee->stripPointerCasts()); + // We can't trivially replace the alias with the aliasee if the aliasee is + // non-trivial in some way. + // TODO: Try to handle non-zero GEPs of local aliasees. + if (!Target) + continue; Target->removeDeadConstantUsers(); // Make all users of the alias use the aliasee instead. @@ -3007,7 +2878,7 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) { if (!hasUsesToReplace(*J, Used, RenameTarget)) continue; - J->replaceAllUsesWith(Aliasee); + J->replaceAllUsesWith(ConstantExpr::getBitCast(Aliasee, J->getType())); ++NumAliasesResolved; Changed = true; @@ -3015,7 +2886,8 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) { // Give the aliasee the name, linkage and other attributes of the alias. Target->takeName(J); Target->setLinkage(J->getLinkage()); - Target->GlobalValue::copyAttributesFrom(J); + Target->setVisibility(J->getVisibility()); + Target->setDLLStorageClass(J->getDLLStorageClass()); if (Used.usedErase(J)) Used.usedInsert(Target); @@ -3038,12 +2910,12 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) { static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::cxa_atexit)) - return 0; + return nullptr; Function *Fn = M.getFunction(TLI->getName(LibFunc::cxa_atexit)); if (!Fn) - return 0; + return nullptr; FunctionType *FTy = Fn->getFunctionType(); @@ -3054,7 +2926,7 @@ static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) { !FTy->getParamType(0)->isPointerTy() || !FTy->getParamType(1)->isPointerTy() || !FTy->getParamType(2)->isPointerTy()) - return 0; + return nullptr; return Fn; } @@ -3122,8 +2994,8 @@ bool GlobalOpt::OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) { // and remove them. 
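The __cxa_atexit cleanup referenced above targets registrations of destructors that provably do nothing. A hypothetical C++ source that produces one:

    // A static-storage object with a user-provided but empty destructor:
    // the Itanium C++ ABI still registers it at startup, roughly as
    // __cxa_atexit(&Logger::~Logger, &TheLogger, &__dso_handle), and
    // OptimizeEmptyGlobalCXXDtors deletes such calls once the destructor
    // body is known to be empty.
    struct Logger {
      ~Logger() {}                 // empty body, no observable effect at exit
    };
    static Logger TheLogger;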
bool Changed = false; - for (Function::use_iterator I = CXAAtExitFn->use_begin(), - E = CXAAtExitFn->use_end(); I != E;) { + for (auto I = CXAAtExitFn->user_begin(), E = CXAAtExitFn->user_end(); + I != E;) { // We're only interested in calls. Theoretically, we could handle invoke // instructions as well, but neither llvm-gcc nor clang generate invokes // to __cxa_atexit. @@ -3155,12 +3027,10 @@ bool GlobalOpt::OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) { bool GlobalOpt::runOnModule(Module &M) { bool Changed = false; - TD = getAnalysisIfAvailable<DataLayout>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; TLI = &getAnalysis<TargetLibraryInfo>(); - // Try to find the llvm.globalctors list. - GlobalVariable *GlobalCtors = FindGlobalCtors(M); - bool LocalChange = true; while (LocalChange) { LocalChange = false; @@ -3169,8 +3039,9 @@ bool GlobalOpt::runOnModule(Module &M) { LocalChange |= OptimizeFunctions(M); // Optimize global_ctors list. - if (GlobalCtors) - LocalChange |= OptimizeGlobalCtorsList(GlobalCtors); + LocalChange |= optimizeGlobalCtorsList(M, [&](Function *F) { + return EvaluateStaticConstructor(F, DL, TLI); + }); // Optimize non-address-taken globals. LocalChange |= OptimizeGlobalVars(M); diff --git a/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp b/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp index 4ac1dfc..af541d1 100644 --- a/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp +++ b/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp @@ -15,18 +15,19 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "ipconstprop" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" -#include "llvm/Support/CallSite.h" using namespace llvm; +#define DEBUG_TYPE "ipconstprop" + STATISTIC(NumArgumentsProped, "Number of args turned into constants"); STATISTIC(NumReturnValProped, "Number of return values turned into constants"); @@ -39,7 +40,7 @@ namespace { initializeIPCPPass(*PassRegistry::getPassRegistry()); } - bool runOnModule(Module &M); + bool runOnModule(Module &M) override; private: bool PropagateConstantsIntoArguments(Function &F); bool PropagateConstantReturn(Function &F); @@ -86,18 +87,18 @@ bool IPCP::PropagateConstantsIntoArguments(Function &F) { ArgumentConstants.resize(F.arg_size()); unsigned NumNonconstant = 0; - for (Value::use_iterator UI = F.use_begin(), E = F.use_end(); UI != E; ++UI) { - User *U = *UI; + for (Use &U : F.uses()) { + User *UR = U.getUser(); // Ignore blockaddress uses. - if (isa<BlockAddress>(U)) continue; + if (isa<BlockAddress>(UR)) continue; // Used by a non-instruction, or not the callee of a function, do not // transform. - if (!isa<CallInst>(U) && !isa<InvokeInst>(U)) + if (!isa<CallInst>(UR) && !isa<InvokeInst>(UR)) return false; - CallSite CS(cast<Instruction>(U)); - if (!CS.isCallee(UI)) + CallSite CS(cast<Instruction>(UR)); + if (!CS.isCallee(&U)) return false; // Check out all of the potentially constant arguments. 
Note that we don't @@ -112,7 +113,7 @@ bool IPCP::PropagateConstantsIntoArguments(Function &F) { continue; Constant *C = dyn_cast<Constant>(*AI); - if (C && ArgumentConstants[i].first == 0) { + if (C && ArgumentConstants[i].first == nullptr) { ArgumentConstants[i].first = C; // First constant seen. } else if (C && ArgumentConstants[i].first == C) { // Still the constant value we think it is. @@ -135,11 +136,11 @@ bool IPCP::PropagateConstantsIntoArguments(Function &F) { for (unsigned i = 0, e = ArgumentConstants.size(); i != e; ++i, ++AI) { // Do we have a constant argument? if (ArgumentConstants[i].second || AI->use_empty() || - (AI->hasByValAttr() && !F.onlyReadsMemory())) + AI->hasInAllocaAttr() || (AI->hasByValAttr() && !F.onlyReadsMemory())) continue; Value *V = ArgumentConstants[i].first; - if (V == 0) V = UndefValue::get(AI->getType()); + if (!V) V = UndefValue::get(AI->getType()); AI->replaceAllUsesWith(V); ++NumArgumentsProped; MadeChange = true; @@ -209,8 +210,8 @@ bool IPCP::PropagateConstantReturn(Function &F) { } // Different or no known return value? Don't propagate this return // value. - RetVals[i] = 0; - // All values non constant? Stop looking. + RetVals[i] = nullptr; + // All values non-constant? Stop looking. if (++NumNonConstant == RetVals.size()) return false; } @@ -220,13 +221,13 @@ bool IPCP::PropagateConstantReturn(Function &F) { // over all users, replacing any uses of the return value with the returned // constant. bool MadeChange = false; - for (Value::use_iterator UI = F.use_begin(), E = F.use_end(); UI != E; ++UI) { - CallSite CS(*UI); + for (Use &U : F.uses()) { + CallSite CS(U.getUser()); Instruction* Call = CS.getInstruction(); // Not a call instruction or a call instruction that's not calling F // directly? - if (!Call || !CS.isCallee(UI)) + if (!Call || !CS.isCallee(&U)) continue; // Call result not used? @@ -235,7 +236,7 @@ bool IPCP::PropagateConstantReturn(Function &F) { MadeChange = true; - if (STy == 0) { + if (!STy) { Value* New = RetVals[0]; if (Argument *A = dyn_cast<Argument>(New)) // Was an argument returned? 
Then find the corresponding argument in @@ -244,9 +245,8 @@ bool IPCP::PropagateConstantReturn(Function &F) { Call->replaceAllUsesWith(New); continue; } - - for (Value::use_iterator I = Call->use_begin(), E = Call->use_end(); - I != E;) { + + for (auto I = Call->user_begin(), E = Call->user_end(); I != E;) { Instruction *Ins = cast<Instruction>(*I); // Increment now, so we can remove the use diff --git a/contrib/llvm/lib/Transforms/IPO/IPO.cpp b/contrib/llvm/lib/Transforms/IPO/IPO.cpp index 5d563d8..b4d31d8 100644 --- a/contrib/llvm/lib/Transforms/IPO/IPO.cpp +++ b/contrib/llvm/lib/Transforms/IPO/IPO.cpp @@ -44,6 +44,7 @@ void llvm::initializeIPO(PassRegistry &Registry) { initializeStripDebugDeclarePass(Registry); initializeStripDeadDebugInfoPass(Registry); initializeStripNonDebugSymbolsPass(Registry); + initializeBarrierNoopPass(Registry); } void LLVMInitializeIPO(LLVMPassRegistryRef R) { diff --git a/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp b/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp index 437597e..624cb90 100644 --- a/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp +++ b/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp @@ -12,22 +12,23 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "inline" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" -#include "llvm/Support/CallSite.h" #include "llvm/Transforms/IPO/InlinerPass.h" using namespace llvm; +#define DEBUG_TYPE "inline" + namespace { /// \brief Inliner pass which only handles "always inline" functions. @@ -36,24 +37,25 @@ class AlwaysInliner : public Inliner { public: // Use extremely low threshold. - AlwaysInliner() : Inliner(ID, -2000000000, /*InsertLifetime*/ true), ICA(0) { + AlwaysInliner() : Inliner(ID, -2000000000, /*InsertLifetime*/ true), + ICA(nullptr) { initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry()); } AlwaysInliner(bool InsertLifetime) - : Inliner(ID, -2000000000, InsertLifetime), ICA(0) { + : Inliner(ID, -2000000000, InsertLifetime), ICA(nullptr) { initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry()); } static char ID; // Pass identification, replacement for typeid - virtual InlineCost getInlineCost(CallSite CS); + InlineCost getInlineCost(CallSite CS) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const; - virtual bool runOnSCC(CallGraphSCC &SCC); + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnSCC(CallGraphSCC &SCC) override; using llvm::Pass::doFinalization; - virtual bool doFinalization(CallGraph &CG) { + bool doFinalization(CallGraph &CG) override { return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/ true); } }; @@ -63,7 +65,7 @@ public: char AlwaysInliner::ID = 0; INITIALIZE_PASS_BEGIN(AlwaysInliner, "always-inline", "Inliner for always_inline functions", false, false) -INITIALIZE_PASS_DEPENDENCY(CallGraph) +INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis) INITIALIZE_PASS_END(AlwaysInliner, "always-inline", "Inliner for always_inline functions", false, false) @@ -93,8 +95,7 @@ InlineCost AlwaysInliner::getInlineCost(CallSite CS) { // that are viable for inlining. FIXME: We shouldn't even get here for // declarations. 
if (Callee && !Callee->isDeclaration() && - Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::AlwaysInline) && + CS.hasFnAttr(Attribute::AlwaysInline) && ICA->isInlineViable(*Callee)) return InlineCost::getAlways(); diff --git a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp index 57379a3..d189756 100644 --- a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp +++ b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp @@ -11,21 +11,22 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "inline" #include "llvm/Transforms/IPO.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" -#include "llvm/Support/CallSite.h" #include "llvm/Transforms/IPO/InlinerPass.h" using namespace llvm; +#define DEBUG_TYPE "inline" + namespace { /// \brief Actual inliner pass implementation. @@ -37,31 +38,42 @@ class SimpleInliner : public Inliner { InlineCostAnalysis *ICA; public: - SimpleInliner() : Inliner(ID), ICA(0) { + SimpleInliner() : Inliner(ID), ICA(nullptr) { initializeSimpleInlinerPass(*PassRegistry::getPassRegistry()); } SimpleInliner(int Threshold) - : Inliner(ID, Threshold, /*InsertLifetime*/ true), ICA(0) { + : Inliner(ID, Threshold, /*InsertLifetime*/ true), ICA(nullptr) { initializeSimpleInlinerPass(*PassRegistry::getPassRegistry()); } static char ID; // Pass identification, replacement for typeid - InlineCost getInlineCost(CallSite CS) { + InlineCost getInlineCost(CallSite CS) override { return ICA->getInlineCost(CS, getInlineThreshold(CS)); } - virtual bool runOnSCC(CallGraphSCC &SCC); - virtual void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnSCC(CallGraphSCC &SCC) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; }; +static int computeThresholdFromOptLevels(unsigned OptLevel, + unsigned SizeOptLevel) { + if (OptLevel > 2) + return 275; + if (SizeOptLevel == 1) // -Os + return 75; + if (SizeOptLevel == 2) // -Oz + return 25; + return 225; +} + } // end anonymous namespace char SimpleInliner::ID = 0; INITIALIZE_PASS_BEGIN(SimpleInliner, "inline", "Function Integration/Inlining", false, false) -INITIALIZE_PASS_DEPENDENCY(CallGraph) +INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis) INITIALIZE_PASS_END(SimpleInliner, "inline", "Function Integration/Inlining", false, false) @@ -72,6 +84,12 @@ Pass *llvm::createFunctionInliningPass(int Threshold) { return new SimpleInliner(Threshold); } +Pass *llvm::createFunctionInliningPass(unsigned OptLevel, + unsigned SizeOptLevel) { + return new SimpleInliner( + computeThresholdFromOptLevels(OptLevel, SizeOptLevel)); +} + bool SimpleInliner::runOnSCC(CallGraphSCC &SCC) { ICA = &getAnalysis<InlineCostAnalysis>(); return Inliner::runOnSCC(SCC); diff --git a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp index d75d6ca..9087ab2 100644 --- a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp @@ -13,17 +13,17 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "inline" #include "llvm/Transforms/IPO/InlinerPass.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include 
"llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" -#include "llvm/Support/CallSite.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -32,6 +32,8 @@ #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "inline" + STATISTIC(NumInlined, "Number of functions inlined"); STATISTIC(NumCallsDeleted, "Number of call sites deleted, not inlined"); STATISTIC(NumDeleted, "Number of functions deleted because all callers found"); @@ -50,6 +52,13 @@ static cl::opt<int> HintThreshold("inlinehint-threshold", cl::Hidden, cl::init(325), cl::desc("Threshold for inlining functions with inline hint")); +// We instroduce this threshold to help performance of instrumentation based +// PGO before we actually hook up inliner with analysis passes such as BPI and +// BFI. +static cl::opt<int> +ColdThreshold("inlinecold-threshold", cl::Hidden, cl::init(225), + cl::desc("Threshold for inlining functions with cold attribute")); + // Threshold to use when optsize is specified (and there is no -inline-limit). const int OptSizeThreshold = 75; @@ -117,7 +126,7 @@ static void AdjustCallerSSPLevel(Function *Caller, Function *Callee) { static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, InlinedArrayAllocasTy &InlinedArrayAllocas, int InlineHistory, bool InsertLifetime, - const DataLayout *TD) { + const DataLayout *DL) { Function *Callee = CS.getCalledFunction(); Function *Caller = CS.getCaller(); @@ -176,7 +185,7 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, // canonicalized to be an allocation *of* an array), or allocations whose // type is not itself an array (because we're afraid of pessimizing SRoA). ArrayType *ATy = dyn_cast<ArrayType>(AI->getAllocatedType()); - if (ATy == 0 || AI->isArrayAllocation()) + if (!ATy || AI->isArrayAllocation()) continue; // Get the list of all available allocas for this array type. @@ -196,7 +205,7 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, // If we don't have data layout information, and only one alloca is using // the target default, then we can't safely merge them because we can't // pick the greater alignment. - if (!TD && (!Align1 || !Align2) && Align1 != Align2) + if (!DL && (!Align1 || !Align2) && Align1 != Align2) continue; // The available alloca has to be in the right function, not in some other @@ -218,8 +227,8 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, if (Align1 != Align2) { if (!Align1 || !Align2) { - assert(TD && "DataLayout required to compare default alignments"); - unsigned TypeAlign = TD->getABITypeAlignment(AI->getAllocatedType()); + assert(DL && "DataLayout required to compare default alignments"); + unsigned TypeAlign = DL->getABITypeAlignment(AI->getAllocatedType()); Align1 = Align1 ? Align1 : TypeAlign; Align2 = Align2 ? 
Align2 : TypeAlign; @@ -232,7 +241,7 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, AI->eraseFromParent(); MergedAwayAlloca = true; ++NumMergedAllocas; - IFI.StaticAllocas[AllocaNo] = 0; + IFI.StaticAllocas[AllocaNo] = nullptr; break; } @@ -277,9 +286,28 @@ unsigned Inliner::getInlineThreshold(CallSite CS) const { Attribute::MinSize)) thres = HintThreshold; + // Listen to the cold attribute when it would decrease the threshold. + bool ColdCallee = Callee && !Callee->isDeclaration() && + Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::Cold); + // Command line argument for InlineLimit will override the default + // ColdThreshold. If we have -inline-threshold but no -inlinecold-threshold, + // do not use the default cold threshold even if it is smaller. + if ((InlineLimit.getNumOccurrences() == 0 || + ColdThreshold.getNumOccurrences() > 0) && ColdCallee && + ColdThreshold < thres) + thres = ColdThreshold; + return thres; } +static void emitAnalysis(CallSite CS, const Twine &Msg) { + Function *Caller = CS.getCaller(); + LLVMContext &Ctx = Caller->getContext(); + DebugLoc DLoc = CS.getInstruction()->getDebugLoc(); + emitOptimizationRemarkAnalysis(Ctx, DEBUG_TYPE, *Caller, DLoc, Msg); +} + /// shouldInline - Return true if the inliner should attempt to inline /// at the given CallSite. bool Inliner::shouldInline(CallSite CS) { @@ -288,12 +316,16 @@ bool Inliner::shouldInline(CallSite CS) { if (IC.isAlways()) { DEBUG(dbgs() << " Inlining: cost=always" << ", Call: " << *CS.getInstruction() << "\n"); + emitAnalysis(CS, Twine(CS.getCalledFunction()->getName()) + + " should always be inlined (cost=always)"); return true; } if (IC.isNever()) { DEBUG(dbgs() << " NOT Inlining: cost=never" << ", Call: " << *CS.getInstruction() << "\n"); + emitAnalysis(CS, Twine(CS.getCalledFunction()->getName() + + " should never be inlined (cost=never)")); return false; } @@ -302,6 +334,10 @@ bool Inliner::shouldInline(CallSite CS) { DEBUG(dbgs() << " NOT Inlining: cost=" << IC.getCost() << ", thres=" << (IC.getCostDelta() + IC.getCost()) << ", Call: " << *CS.getInstruction() << "\n"); + emitAnalysis(CS, Twine(CS.getCalledFunction()->getName() + + " too costly to inline (cost=") + + Twine(IC.getCost()) + ", threshold=" + + Twine(IC.getCostDelta() + IC.getCost()) + ")"); return false; } @@ -330,9 +366,8 @@ bool Inliner::shouldInline(CallSite CS) { bool callerWillBeRemoved = Caller->hasLocalLinkage(); // This bool tracks what happens if we DO inline C into B. bool inliningPreventsSomeOuterInline = false; - for (Value::use_iterator I = Caller->use_begin(), E =Caller->use_end(); - I != E; ++I) { - CallSite CS2(*I); + for (User *U : Caller->users()) { + CallSite CS2(U); // If this isn't a call to Caller (it could be some other sort // of reference) skip it. Such references will prevent the caller @@ -363,13 +398,18 @@ bool Inliner::shouldInline(CallSite CS) { // one is set very low by getInlineCost, in anticipation that Caller will // be removed entirely. We did not account for this above unless there // is only one caller of Caller. 
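A condensed, stand-alone restatement of the cold-callee rule that the getInlineThreshold hunk above adds. The pickThreshold helper and its bool parameters are illustrative stand-ins for the cl::opt getNumOccurrences() checks, not code from the patch:

    #include <cassert>

    // The cold threshold may only lower the limit, and an explicit
    // -inline-threshold beats the *default* cold threshold unless
    // -inlinecold-threshold was also given on the command line.
    static int pickThreshold(int Thres, int ColdThres, bool ColdCallee,
                             bool InlineLimitGiven, bool ColdThresholdGiven) {
      if ((!InlineLimitGiven || ColdThresholdGiven) && ColdCallee &&
          ColdThres < Thres)
        return ColdThres;
      return Thres;
    }

    int main() {
      // Cold callee, all defaults: the default cold threshold (225) caps a
      // higher hint threshold (325).
      assert(pickThreshold(325, 225, true, false, false) == 225);
      // Only -inline-threshold given: the default cold threshold is ignored.
      assert(pickThreshold(500, 225, true, true, false) == 500);
      // Both flags given: the explicit cold threshold applies again.
      assert(pickThreshold(500, 100, true, true, true) == 100);
    }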
- if (callerWillBeRemoved && Caller->use_begin() != Caller->use_end()) + if (callerWillBeRemoved && !Caller->use_empty()) TotalSecondaryCost += InlineConstants::LastCallToStaticBonus; if (inliningPreventsSomeOuterInline && TotalSecondaryCost < IC.getCost()) { DEBUG(dbgs() << " NOT Inlining: " << *CS.getInstruction() << " Cost = " << IC.getCost() << ", outer Cost = " << TotalSecondaryCost << '\n'); + emitAnalysis( + CS, Twine("Not inlining. Cost of inlining " + + CS.getCalledFunction()->getName() + + " increases the cost of inlining " + + CS.getCaller()->getName() + " in other contexts")); return false; } } @@ -377,6 +417,10 @@ bool Inliner::shouldInline(CallSite CS) { DEBUG(dbgs() << " Inlining: cost=" << IC.getCost() << ", thres=" << (IC.getCostDelta() + IC.getCost()) << ", Call: " << *CS.getInstruction() << '\n'); + emitAnalysis( + CS, CS.getCalledFunction()->getName() + Twine(" can be inlined into ") + + CS.getCaller()->getName() + " with cost=" + Twine(IC.getCost()) + + " (threshold=" + Twine(IC.getCostDelta() + IC.getCost()) + ")"); return true; } @@ -395,8 +439,9 @@ static bool InlineHistoryIncludes(Function *F, int InlineHistoryID, } bool Inliner::runOnSCC(CallGraphSCC &SCC) { - CallGraph &CG = getAnalysis<CallGraph>(); - const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); + CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; const TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); SmallPtrSet<Function*, 8> SCCFunctions; @@ -456,7 +501,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { InlinedArrayAllocasTy InlinedArrayAllocas; - InlineFunctionInfo InlineInfo(&CG, TD); + InlineFunctionInfo InlineInfo(&CG, DL); // Now that we have all of the call sites, loop over them and inline them if // it looks profitable to do so. @@ -485,7 +530,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { ++NumCallsDeleted; } else { // We can only inline direct calls to non-declarations. - if (Callee == 0 || Callee->isDeclaration()) continue; + if (!Callee || Callee->isDeclaration()) continue; // If this call site was obtained by inlining another function, verify // that the include path for the function did not include the callee @@ -497,18 +542,37 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { InlineHistoryIncludes(Callee, InlineHistoryID, InlineHistory)) continue; - + LLVMContext &CallerCtx = Caller->getContext(); + + // Get DebugLoc to report. CS will be invalid after Inliner. + DebugLoc DLoc = CS.getInstruction()->getDebugLoc(); + // If the policy determines that we should inline this function, // try to do so. - if (!shouldInline(CS)) + if (!shouldInline(CS)) { + emitOptimizationRemarkMissed(CallerCtx, DEBUG_TYPE, *Caller, DLoc, + Twine(Callee->getName() + + " will not be inlined into " + + Caller->getName())); continue; + } // Attempt to inline the function. if (!InlineCallIfPossible(CS, InlineInfo, InlinedArrayAllocas, - InlineHistoryID, InsertLifetime, TD)) + InlineHistoryID, InsertLifetime, DL)) { + emitOptimizationRemarkMissed(CallerCtx, DEBUG_TYPE, *Caller, DLoc, + Twine(Callee->getName() + + " will not be inlined into " + + Caller->getName())); continue; + } ++NumInlined; - + + // Report the inline decision. 
+ emitOptimizationRemark( + CallerCtx, DEBUG_TYPE, *Caller, DLoc, + Twine(Callee->getName() + " inlined into " + Caller->getName())); + // If inlining this function gave us any new call sites, throw them // onto our worklist to process. They are useful inline candidates. if (!InlineInfo.InlinedCalls.empty()) { diff --git a/contrib/llvm/lib/Transforms/IPO/Internalize.cpp b/contrib/llvm/lib/Transforms/IPO/Internalize.cpp index 64e2ced..c970a1a 100644 --- a/contrib/llvm/lib/Transforms/IPO/Internalize.cpp +++ b/contrib/llvm/lib/Transforms/IPO/Internalize.cpp @@ -19,7 +19,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "internalize" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" @@ -35,6 +34,8 @@ #include <set> using namespace llvm; +#define DEBUG_TYPE "internalize" + STATISTIC(NumAliases , "Number of aliases internalized"); STATISTIC(NumFunctions, "Number of functions internalized"); STATISTIC(NumGlobals , "Number of global vars internalized"); @@ -59,11 +60,11 @@ namespace { explicit InternalizePass(); explicit InternalizePass(ArrayRef<const char *> ExportList); void LoadFile(const char *Filename); - virtual bool runOnModule(Module &M); + bool runOnModule(Module &M) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addPreserved<CallGraph>(); + AU.addPreserved<CallGraphWrapperPass>(); } }; } // end anonymous namespace @@ -72,8 +73,7 @@ char InternalizePass::ID = 0; INITIALIZE_PASS(InternalizePass, "internalize", "Internalize Global Symbols", false, false) -InternalizePass::InternalizePass() - : ModulePass(ID) { +InternalizePass::InternalizePass() : ModulePass(ID) { initializeInternalizePassPass(*PassRegistry::getPassRegistry()); if (!APIFile.empty()) // If a filename is specified, use it. LoadFile(APIFile.c_str()); @@ -81,7 +81,7 @@ InternalizePass::InternalizePass() } InternalizePass::InternalizePass(ArrayRef<const char *> ExportList) - : ModulePass(ID){ + : ModulePass(ID) { initializeInternalizePassPass(*PassRegistry::getPassRegistry()); for(ArrayRef<const char *>::const_iterator itr = ExportList.begin(); itr != ExportList.end(); itr++) { @@ -115,6 +115,10 @@ static bool shouldInternalize(const GlobalValue &GV, if (GV.hasAvailableExternallyLinkage()) return false; + // Assume that dllexported symbols are referenced elsewhere + if (GV.hasDLLExportStorageClass()) + return false; + // Already has internal linkage if (GV.hasLocalLinkage()) return false; @@ -127,8 +131,9 @@ static bool shouldInternalize(const GlobalValue &GV, } bool InternalizePass::runOnModule(Module &M) { - CallGraph *CG = getAnalysisIfAvailable<CallGraph>(); - CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : 0; + CallGraphWrapperPass *CGPass = getAnalysisIfAvailable<CallGraphWrapperPass>(); + CallGraph *CG = CGPass ? &CGPass->getCallGraph() : nullptr; + CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : nullptr; bool Changed = false; SmallPtrSet<GlobalValue *, 8> Used; @@ -150,11 +155,11 @@ bool InternalizePass::runOnModule(Module &M) { } // Mark all functions not in the api as internal. - // FIXME: maybe use private linkage? 
for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { if (!shouldInternalize(*I, ExternalNames)) continue; + I->setVisibility(GlobalValue::DefaultVisibility); I->setLinkage(GlobalValue::InternalLinkage); if (ExternalNode) @@ -186,12 +191,12 @@ bool InternalizePass::runOnModule(Module &M) { // Mark all global variables with initializers that are not in the api as // internal as well. - // FIXME: maybe use private linkage? for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; ++I) { if (!shouldInternalize(*I, ExternalNames)) continue; + I->setVisibility(GlobalValue::DefaultVisibility); I->setLinkage(GlobalValue::InternalLinkage); Changed = true; ++NumGlobals; @@ -204,6 +209,7 @@ bool InternalizePass::runOnModule(Module &M) { if (!shouldInternalize(*I, ExternalNames)) continue; + I->setVisibility(GlobalValue::DefaultVisibility); I->setLinkage(GlobalValue::InternalLinkage); Changed = true; ++NumAliases; @@ -213,9 +219,7 @@ bool InternalizePass::runOnModule(Module &M) { return Changed; } -ModulePass *llvm::createInternalizePass() { - return new InternalizePass(); -} +ModulePass *llvm::createInternalizePass() { return new InternalizePass(); } ModulePass *llvm::createInternalizePass(ArrayRef<const char *> ExportList) { return new InternalizePass(ExportList); diff --git a/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp index 8282a8e..20414aa 100644 --- a/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp +++ b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp @@ -14,11 +14,10 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-extract" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" @@ -30,6 +29,8 @@ #include <set> using namespace llvm; +#define DEBUG_TYPE "loop-extract" + STATISTIC(NumExtracted, "Number of loops extracted"); namespace { @@ -42,12 +43,12 @@ namespace { initializeLoopExtractorPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnLoop(Loop *L, LPPassManager &LPM); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequiredID(BreakCriticalEdgesID); AU.addRequiredID(LoopSimplifyID); - AU.addRequired<DominatorTree>(); + AU.addRequired<DominatorTreeWrapperPass>(); } }; } @@ -57,7 +58,7 @@ INITIALIZE_PASS_BEGIN(LoopExtractor, "loop-extract", "Extract loops into new functions", false, false) INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(LoopExtractor, "loop-extract", "Extract loops into new functions", false, false) @@ -79,6 +80,9 @@ INITIALIZE_PASS(SingleLoopExtractor, "loop-extract-single", Pass *llvm::createLoopExtractorPass() { return new LoopExtractor(); } bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) { + if (skipOptnoneFunction(L)) + return false; + // Only visit top-level loops. 
 if (L->getParentLoop())
   return false;

@@ -87,7 +91,7 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) {
   if (!L->isLoopSimplifyForm())
     return false;

-  DominatorTree &DT = getAnalysis<DominatorTree>();
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   bool Changed = false;

   // If there is more than one top-level loop in this function, extract all of
@@ -133,7 +137,7 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) {
   if (NumLoops == 0) return Changed;
   --NumLoops;
   CodeExtractor Extractor(DT, *L);
-  if (Extractor.extractCodeRegion() != 0) {
+  if (Extractor.extractCodeRegion() != nullptr) {
     Changed = true;
     // After extraction, the loop is replaced by a function call, so
     // we shouldn't try to run any more loop passes on it.
@@ -177,7 +181,7 @@ namespace {
       LoadFile(BlockFile.c_str());
     }

-    bool runOnModule(Module &M);
+    bool runOnModule(Module &M) override;
   };
 }
@@ -238,7 +242,7 @@ void BlockExtractorPass::SplitLandingPadPreds(Function *F) {
     if (!Split) continue;

     SmallVector<BasicBlock*, 2> NewBBs;
-    SplitLandingPadPredecessors(LPad, Parent, ".1", ".2", 0, NewBBs);
+    SplitLandingPadPredecessors(LPad, Parent, ".1", ".2", nullptr, NewBBs);
   }
 }
diff --git a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp
index 3861421..2fb0ddb 100644
--- a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp
@@ -9,13 +9,24 @@
 //
 // This pass looks for equivalent functions that are mergeable and folds them.
 //
-// A hash is computed from the function, based on its type and number of
-// basic blocks.
+// An order relation is defined on the set of functions. It is built on a
+// function comparison procedure that returns
+// 0 when the functions are equal,
+// -1 when the left function is less than the right function, and
+// 1 in the opposite case. We need a total ordering, so we have to maintain
+// four properties on the set of functions:
+// a <= a (reflexivity)
+// if a <= b and b <= a then a = b (antisymmetry)
+// if a <= b and b <= c then a <= c (transitivity).
+// for all a and b: a <= b or b <= a (totality).
 //
-// Once all hashes are computed, we perform an expensive equality comparison
-// on each function pair. This takes n^2/2 comparisons per bucket, so it's
-// important that the hash function be high quality. The equality comparison
-// iterates through each instruction in each basic block.
+// The comparison iterates through each instruction in each basic block.
+// Functions are kept in a binary tree, and for each new function F we
+// perform a lookup in that tree.
+// In practice it works the following way:
+// -- We define a Function* container class with a custom "operator<"
+//    (FunctionPtr).
+// -- "FunctionPtr" instances are stored in a std::set collection, so every
+//    std::set::insert operation completes in log(N) time.
 //
 // When a match is found the functions are folded. If both functions are
 // overridable, we move the functionality into a new internal function and
@@ -31,9 +42,6 @@
 // the object they belong to. However, as long as it's only used for a lookup
 // and call, this is irrelevant, and we'd like to fold such functions.
 //
-// * switch from n^2 pair-wise comparisons to an n-way comparison for each
-// bucket.
-//
 // * be smarter about bitcasts.
 //
 // In order to fold functions, we will sometimes add either bitcast instructions
@@ -41,15 +49,45 @@
 // analysis since the two functions differ where one has a bitcast and the
 // other doesn't. We should learn to look through bitcasts.
 //
+// * Compare complex types with pointer types inside.
+// * Compare cross-reference cases.
+// * Compare complex expressions.
+//
+// All three issues above can be described as the ability to prove that
+// fA == fB == fC == fE == fF == fG in the example below:
+//
+//  void fA() {
+//    fB();
+//  }
+//  void fB() {
+//    fA();
+//  }
+//
+//  void fE() {
+//    fF();
+//  }
+//  void fF() {
+//    fG();
+//  }
+//  void fG() {
+//    fE();
+//  }
+//
+// The simplest cross-reference case (fA <--> fB) was implemented in previous
+// versions of MergeFunctions, though it was present in only two function
+// pairs in the test-suite (which counts >50k functions).
+// The ability to detect complex cross-referencing (e.g.: A->B->C->D->A)
+// could cover many more cases.
+//
 //===----------------------------------------------------------------------===//

-#define DEBUG_TYPE "mergefunc"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/IRBuilder.h"
@@ -58,103 +96,28 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/ValueHandle.h"
 #include "llvm/Pass.h"
-#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/ValueHandle.h"
 #include "llvm/Support/raw_ostream.h"
 #include <vector>
 using namespace llvm;

+#define DEBUG_TYPE "mergefunc"
+
 STATISTIC(NumFunctionsMerged, "Number of functions merged");
 STATISTIC(NumThunksWritten, "Number of thunks generated");
 STATISTIC(NumAliasesWritten, "Number of aliases generated");
 STATISTIC(NumDoubleWeak, "Number of new functions created");

-/// Returns the type id for a type to be hashed. We turn pointer types into
-/// integers here because the actual compare logic below considers pointers and
-/// integers of the same size as equal.
-static Type::TypeID getTypeIDForHash(Type *Ty) {
-  if (Ty->isPointerTy())
-    return Type::IntegerTyID;
-  return Ty->getTypeID();
-}
-
-/// Creates a hash-code for the function which is the same for any two
-/// functions that will compare equal, without looking at the instructions
-/// inside the function.
-static unsigned profileFunction(const Function *F) {
-  FunctionType *FTy = F->getFunctionType();
-
-  FoldingSetNodeID ID;
-  ID.AddInteger(F->size());
-  ID.AddInteger(F->getCallingConv());
-  ID.AddBoolean(F->hasGC());
-  ID.AddBoolean(FTy->isVarArg());
-  ID.AddInteger(getTypeIDForHash(FTy->getReturnType()));
-  for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
-    ID.AddInteger(getTypeIDForHash(FTy->getParamType(i)));
-  return ID.ComputeHash();
-}
-
-namespace {
-
-/// ComparableFunction - A struct that pairs together functions with a
-/// DataLayout so that we can keep them together as elements in the DenseSet.
-class ComparableFunction {
-public:
-  static const ComparableFunction EmptyKey;
-  static const ComparableFunction TombstoneKey;
-  static DataLayout * const LookupOnly;
-
-  ComparableFunction(Function *Func, DataLayout *TD)
-    : Func(Func), Hash(profileFunction(Func)), TD(TD) {}
-
-  Function *getFunc() const { return Func; }
-  unsigned getHash() const { return Hash; }
-  DataLayout *getTD() const { return TD; }
-
-  // Drops the AssertingVH reference to the function. Outside of debug mode,
-  // this does nothing.
-  void release() {
-    assert(Func &&
-           "Attempted to release function twice, or release empty/tombstone!");
-    Func = NULL;
-  }
-
-private:
-  explicit ComparableFunction(unsigned Hash)
-    : Func(NULL), Hash(Hash), TD(NULL) {}
-
-  AssertingVH<Function> Func;
-  unsigned Hash;
-  DataLayout *TD;
-};
-
-const ComparableFunction ComparableFunction::EmptyKey = ComparableFunction(0);
-const ComparableFunction ComparableFunction::TombstoneKey =
-    ComparableFunction(1);
-DataLayout *const ComparableFunction::LookupOnly = (DataLayout*)(-1);
-
-}
-
-namespace llvm {
-  template <>
-  struct DenseMapInfo<ComparableFunction> {
-    static ComparableFunction getEmptyKey() {
-      return ComparableFunction::EmptyKey;
-    }
-    static ComparableFunction getTombstoneKey() {
-      return ComparableFunction::TombstoneKey;
-    }
-    static unsigned getHashValue(const ComparableFunction &CF) {
-      return CF.getHash();
-    }
-    static bool isEqual(const ComparableFunction &LHS,
-                        const ComparableFunction &RHS);
-  };
-}
+static cl::opt<unsigned> NumFunctionsForSanityCheck(
+    "mergefunc-sanity",
+    cl::desc("How many functions in module could be used for "
+             "MergeFunctions pass sanity check. "
+             "'0' disables this check. Works only with '-debug' key."),
+    cl::init(0), cl::Hidden);

 namespace {
@@ -164,75 +127,518 @@ namespace {
 /// side of claiming that two functions are different).
 class FunctionComparator {
 public:
-  FunctionComparator(const DataLayout *TD, const Function *F1,
+  FunctionComparator(const DataLayout *DL, const Function *F1,
                      const Function *F2)
-    : F1(F1), F2(F2), TD(TD) {}
+    : FnL(F1), FnR(F2), DL(DL) {}

   /// Test whether the two functions have equivalent behaviour.
-  bool compare();
+  int compare();

 private:
   /// Test whether two basic blocks have equivalent behaviour.
-  bool compare(const BasicBlock *BB1, const BasicBlock *BB2);
+  int compare(const BasicBlock *BBL, const BasicBlock *BBR);
+
+  /// Constants comparison.
+  /// It is analogous to a lexicographical comparison between hypothetical
+  /// numbers of the following format:
+  /// <bitcastability-trait><raw-bit-contents>
+  ///
+  /// 1. Bitcastability.
+  /// Check whether L's type can be losslessly bitcast to R's type.
+  /// At this stage, when a lossless bitcast is not possible, the method
+  /// returns -1 or 1, thus also defining which type is greater in the
+  /// context of bitcastability.
+  /// Stage 0: If the types are equal in terms of cmpTypes, then we can go
+  ///          straight to the contents comparison.
+  ///          If the types differ, remember the types comparison result and
+  ///          check whether we can still bitcast the types.
+  /// Stage 1: Types that satisfy the isFirstClassType condition are always
+  ///          greater than others.
+  /// Stage 2: Vectors are greater than non-vectors.
+  ///          If both types are vectors, the vector with the greater bitwidth
+  ///          is greater.
+  ///          If both types are vectors with the same bitwidth, then the
+  ///          types are bitcastable, and we can skip the other stages and go
+  ///          to the contents comparison.
+  /// Stage 3: Pointer types are greater than non-pointers. If both types are
+  ///          pointers of the same address space, go to the contents
+  ///          comparison. For different address spaces, the pointer with the
+  ///          greater address space is greater.
+  /// Stage 4: The types are neither vectors nor pointers, and they differ.
+  ///          We don't know how to bitcast them, so we'd better not do it,
+  ///          and return the types comparison result (so it determines the
+  ///          relationship among constants we don't know how to bitcast).
+  ///
+  /// For clarity, here is how the set of constants could look on a
+  /// single-dimension axis:
+  ///
+  /// [NFCT], [FCT, "others"], [FCT, pointers], [FCT, vectors]
+  /// Where: NFCT - Not a FirstClassType
+  ///        FCT  - FirstClassType
+  ///
+  /// 2. Compare raw contents.
+  /// This stage ignores types and only compares the bits of L and R.
+  /// It returns 0 if L and R have equivalent contents, and
+  /// -1 or 1 if the values are different.
+  /// Pretty trivial:
+  /// 2.1. If the contents are numbers, compare the numbers.
+  ///      Ints with a greater bitwidth are greater. Ints with the same
+  ///      bitwidth are compared by their contents.
+  /// 2.2. "And so on". To avoid discrepancies with these comments, it is
+  ///      probably better to read the implementation itself.
+  /// 3. Once more about the overall picture. Let's look back at how the
+  /// ordered set of constants will look:
+  /// [NFCT], [FCT, "others"], [FCT, pointers], [FCT, vectors]
+  ///
+  /// Now consider what could be inside [FCT, "others"], for example:
+  /// [FCT, "others"] =
+  /// [
+  ///   [double 0.1], [double 1.23],
+  ///   [i32 1], [i32 2],
+  ///   { double 1.0 },       ; StructTyID, NumElements = 1
+  ///   { i32 1 },            ; StructTyID, NumElements = 1
+  ///   { double 1, i32 1 },  ; StructTyID, NumElements = 2
+  ///   { i32 1, double 1 }   ; StructTyID, NumElements = 2
+  /// ]
+  ///
+  /// Let's explain the order. Floating-point numbers are less than integers,
+  /// simply by the terms of cmpType: FloatTyID < IntegerTyID.
+  /// Floats (with the same fltSemantics) are sorted according to their value.
+  /// Then come the integers, which, like the floats, can easily be sorted
+  /// among each other.
+  /// The structures are grouped at the tail, again because of their TypeID:
+  /// StructTyID > IntegerTyID > FloatTyID.
+  /// Structures with a greater number of elements are greater. Structures
+  /// whose greater elements come first are greater.
+  /// The same logic applies to vectors, arrays and other possible complex
+  /// types.
+  ///
+  /// Bitcastable constants.
+  /// Let's assume that some constant belongs to a group of "so-called-equal"
+  /// values with different types, and at the same time belongs to another
+  /// group of constants with equal types and "really" equal values.
+  ///
+  /// Now let's prove that this is impossible:
+  ///
+  /// If constant A with type TyA is bitcastable to B with type TyB, then:
+  /// 1. All constants with types equal to TyA are bitcastable to B. Since
+  ///    those have to be vectors (if TyA is a vector), pointers
+  ///    (if TyA is a pointer), or otherwise (if TyA equals TyB), those types
+  ///    have to be equal to TyB.
+  /// 2. All constants with non-equal but bitcastable types to TyA are
+  ///    bitcastable to B, once again because we allow this only for vectors
+  ///    and pointers. This statement can be expanded as follows:
+  /// 2.1. All vectors with a bitwidth equal to vector A have a bitwidth equal
+  ///      to vector B, and are thus bitcastable to B as well.
+  /// 2.2. All pointers of the same address space, no matter what they point
+  ///      to, are bitcastable. So if C is a pointer, it can be bitcast to A
+  ///      and to B.
+  /// So any constant equal or bitcastable to A is equal or bitcastable to B.
+  /// QED.
+  ///
+  /// In other words, for pointers and vectors, we ignore the top-level type
+  /// and look at their particular properties (bit-width for vectors, and
+  /// address space for pointers).
+  /// If these properties are equal, compare their contents.
+  int cmpConstants(const Constant *L, const Constant *R);

   /// Assign or look up previously assigned numbers for the two values, and
   /// return whether the numbers are equal. Numbers are assigned in the order
   /// visited.
-  bool enumerate(const Value *V1, const Value *V2);
+  /// Comparison order:
+  /// Stage 0: A value that is the function itself is always greater than
+  ///          others. If the left and right values are references to their
+  ///          own functions, then they are equal.
+  /// Stage 1: Constants are greater than non-constants.
+  ///          If both left and right are constants, then the result of
+  ///          cmpConstants is used as the cmpValues result.
+  /// Stage 2: InlineAsm instances are greater than others. If both left and
+  ///          right are InlineAsm instances, the InlineAsm* pointers are
+  ///          cast to integers and compared as numbers.
+  /// Stage 3: In all other cases we compare the order in which we meet these
+  ///          values in their functions. If the right value was met first
+  ///          during scanning, then the left value is greater.
+  ///          In other words, we compare serial numbers; for more details
+  ///          see the comments for sn_mapL and sn_mapR.
+  int cmpValues(const Value *L, const Value *R);

   /// Compare two Instructions for equivalence, similar to
   /// Instruction::isSameOperationAs but with modifications to the type
   /// comparison.
-  bool isEquivalentOperation(const Instruction *I1,
-                             const Instruction *I2) const;
+  /// Stages are listed in "most significant stage first" order:
+  /// At each stage below, we compare some left and right parts of the
+  /// operations. If the parts are non-equal, we assign the parts' comparison
+  /// result to the operation comparison result and exit from the method.
+  /// Otherwise we proceed to the next stage.
+  /// Stages:
+  /// 1. Operation opcodes. Compared as numbers.
+  /// 2. Number of operands.
+  /// 3. Operation types. Compared with the cmpType method.
+  /// 4. Compare operation subclass optional data as a stream of bytes:
+  ///    just convert it to integers and call cmpNumbers.
+  /// 5. Compare the operations' operand types with cmpType, in
+  ///    most-significant-operand-first order.
+  /// 6. Last stage. Check the operations for some specific attributes.
+  ///    For example, for Load it would be:
+  ///    6.1. Load: volatile (as a boolean flag)
+  ///    6.2. Load: alignment (as integer numbers)
+  ///    6.3. Load: synch-scope (as integer numbers)
+  ///    6.4. Load: range metadata (as integer numbers)
+  ///    At this stage it is better to read the code, since it is no more
+  ///    than 10-15 lines per particular instruction and may change over time.
+  int cmpOperation(const Instruction *L, const Instruction *R) const;

   /// Compare two GEPs for equivalent pointer arithmetic.
-  bool isEquivalentGEP(const GEPOperator *GEP1, const GEPOperator *GEP2);
-  bool isEquivalentGEP(const GetElementPtrInst *GEP1,
-                       const GetElementPtrInst *GEP2) {
-    return isEquivalentGEP(cast<GEPOperator>(GEP1), cast<GEPOperator>(GEP2));
+  /// Parts to be compared for each comparison stage,
+  /// most significant stage first:
+  /// 1. Address space. As numbers.
+  /// 2. Constant offset (if the "DataLayout *DL" field is not NULL,
+  ///    using the GEPOperator::accumulateConstantOffset method).
+  /// 3. Pointer operand type (using the cmpType method).
+  /// 4. Number of operands.
+  /// 5. Compare operands, using the cmpValues method.
+  int cmpGEP(const GEPOperator *GEPL, const GEPOperator *GEPR);
+  int cmpGEP(const GetElementPtrInst *GEPL, const GetElementPtrInst *GEPR) {
+    return cmpGEP(cast<GEPOperator>(GEPL), cast<GEPOperator>(GEPR));
   }

-  /// Compare two Types, treating all pointer types as equal.
-  bool isEquivalentType(Type *Ty1, Type *Ty2) const;
+  /// cmpType - compares two types and
+  /// defines a total ordering on the set of types.
+  ///
+  /// Return values:
+  /// 0 if the types are equal,
+  /// -1 if Left is less than Right,
+  /// +1 if Left is greater than Right.
+  ///
+  /// Description:
+  /// The comparison is broken into stages. As in a lexicographical
+  /// comparison, an earlier stage has higher priority.
+  /// At each stage of the explanation, keep the total ordering properties in
+  /// mind.
+  ///
+  /// 0. Before the comparison we coerce pointer types of address space 0 to
+  /// integer. We also don't bother when left and right are the same type,
+  /// and just return 0 in that case.
+  /// 1. If the types are of different kinds (different type IDs), return the
+  /// result of the type ID comparison, treating the IDs as numbers.
+  /// 2. If the types are vectors or integers, compare the Type* values as
+  /// numbers.
+  /// 3. The types have the same ID, so check whether they belong to the
+  /// following group:
+  /// * Void
+  /// * Float
+  /// * Double
+  /// * X86_FP80
+  /// * FP128
+  /// * PPC_FP128
+  /// * Label
+  /// * Metadata
+  /// If so, return 0; we can treat these types as equal only because their
+  /// IDs are the same.
+  /// 4. If Left and Right are pointers, return the result of the address
+  /// space comparison (a numbers comparison). We can treat pointer types of
+  /// the same address space as equal.
+  /// 5. If the types are complex, both Left and Right are expanded and their
+  /// element types are checked in the same way. If we get Res != 0 at some
+  /// stage, return it; otherwise return 0.
+  /// 6. All other cases hit llvm_unreachable.
+  int cmpType(Type *TyL, Type *TyR) const;
+
+  int cmpNumbers(uint64_t L, uint64_t R) const;
+
+  int cmpAPInt(const APInt &L, const APInt &R) const;
+  int cmpAPFloat(const APFloat &L, const APFloat &R) const;
+  int cmpStrings(StringRef L, StringRef R) const;
+  int cmpAttrs(const AttributeSet L, const AttributeSet R) const;

   // The two functions undergoing comparison.
-  const Function *F1, *F2;
+  const Function *FnL, *FnR;
+
+  const DataLayout *DL;
+
+  /// Assign serial numbers to values from the left function, and to values
+  /// from the right function.
+  /// Explanation:
+  /// When comparing functions, we need to compare the values we meet on the
+  /// left and right sides.
+  /// It is easy to sort things out for external values: it just has to be
+  /// the same value on the left and on the right.
+  /// But for local values (those introduced inside the function body) we
+  /// have to ensure that they were introduced at exactly the same place, and
+  /// play the same role.
+  /// So let's assign a serial number to each value when we meet it for the
+  /// first time. Values that were met at the same place get the same serial
+  /// numbers.
+  /// Given that, a few points about the values assigned to BBs and other
+  /// implementation details are worth explaining (see below).
+  ///
+  /// 1. Safety of BB reordering.
+  /// It's safe to change the order of BasicBlocks in a function.
+  /// The relationship with other functions and the serial numbering do not
+  /// change in this case.
+  /// As follows from FunctionComparator::compare(), we do a CFG walk: we
+  /// start from the entry and then take each terminator. So it doesn't
+  /// matter how the BBs are actually ordered in the function, and since
+  /// cmpValues is called during this walk, the numbering depends only on how
+  /// the BBs are located inside the CFG.
+  /// So the answer is: yes, we will get the same numbering.
+  ///
+  /// 2. Impossibility of using the dominance properties of values.
+  /// If we compare two instruction operands, where the first is a use of a
+  /// local variable AL from function FL, and the second is a use of a local
+  /// variable AR from FR, we could compare their origins and check whether
+  /// they are defined at the same place.
+  /// But we are still not able to compare the operands of PHI nodes, since
+  /// those could be operands from further BBs we haven't scanned yet.
+  /// So it's impossible to use dominance properties in general.
+  DenseMap<const Value*, int> sn_mapL, sn_mapR;
+};

-  const DataLayout *TD;
+class FunctionPtr {
+  AssertingVH<Function> F;
+  const DataLayout *DL;

-  DenseMap<const Value *, const Value *> id_map;
-  DenseSet<const Value *> seen_values;
+public:
+  FunctionPtr(Function *F, const DataLayout *DL) : F(F), DL(DL) {}
+  Function *getFunc() const { return F; }
+  void release() { F = 0; }
+  bool operator<(const FunctionPtr &RHS) const {
+    return (FunctionComparator(DL, F, RHS.getFunc()).compare()) == -1;
+  }
 };
+}

+int FunctionComparator::cmpNumbers(uint64_t L, uint64_t R) const {
+  if (L < R) return -1;
+  if (L > R) return 1;
+  return 0;
+}

+int FunctionComparator::cmpAPInt(const APInt &L, const APInt &R) const {
+  if (int Res = cmpNumbers(L.getBitWidth(), R.getBitWidth()))
+    return Res;
+  if (L.ugt(R)) return 1;
+  if (R.ugt(L)) return -1;
+  return 0;
+}

-// Any two pointers in the same address space are equivalent, intptr_t and
-// pointers are equivalent. Otherwise, standard type equivalence rules apply.
-bool FunctionComparator::isEquivalentType(Type *Ty1, Type *Ty2) const {
+int FunctionComparator::cmpAPFloat(const APFloat &L, const APFloat &R) const {
+  if (int Res = cmpNumbers((uint64_t)&L.getSemantics(),
+                           (uint64_t)&R.getSemantics()))
+    return Res;
+  return cmpAPInt(L.bitcastToAPInt(), R.bitcastToAPInt());
+}

-  PointerType *PTy1 = dyn_cast<PointerType>(Ty1);
-  PointerType *PTy2 = dyn_cast<PointerType>(Ty2);
+int FunctionComparator::cmpStrings(StringRef L, StringRef R) const {
+  // Prevent a heavy comparison: compare sizes first.
+  if (int Res = cmpNumbers(L.size(), R.size()))
+    return Res;

-  if (TD) {
-    if (PTy1 && PTy1->getAddressSpace() == 0) Ty1 = TD->getIntPtrType(Ty1);
-    if (PTy2 && PTy2->getAddressSpace() == 0) Ty2 = TD->getIntPtrType(Ty2);
+  // Compare strings lexicographically only when it is necessary: only when
+  // strings are equal in size.
+  return L.compare(R);
+}
+
+int FunctionComparator::cmpAttrs(const AttributeSet L,
+                                 const AttributeSet R) const {
+  if (int Res = cmpNumbers(L.getNumSlots(), R.getNumSlots()))
+    return Res;
+
+  for (unsigned i = 0, e = L.getNumSlots(); i != e; ++i) {
+    AttributeSet::iterator LI = L.begin(i), LE = L.end(i), RI = R.begin(i),
+                           RE = R.end(i);
+    for (; LI != LE && RI != RE; ++LI, ++RI) {
+      Attribute LA = *LI;
+      Attribute RA = *RI;
+      if (LA < RA)
+        return -1;
+      if (RA < LA)
+        return 1;
+    }
+    if (LI != LE)
+      return 1;
+    if (RI != RE)
+      return -1;
   }
+  return 0;
+}

-  if (Ty1 == Ty2)
-    return true;
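Before moving on to cmpConstants below, a toy model of the serial-numbering scheme that the sn_mapL/sn_mapR comment above describes. std::map stands in for DenseMap, the value types are simplified, and serialNumber is an invented helper, not the pass's real code:

    #include <cassert>
    #include <map>

    // Each value gets the number of the moment it is first seen during the
    // parallel CFG walk, so the two functions agree on a pair of values
    // exactly when those values were introduced at the same point of the walk.
    static int serialNumber(std::map<const void *, int> &SNMap, const void *V) {
      // insert() is a no-op if V is already numbered; size() is the next number.
      return SNMap.insert({V, static_cast<int>(SNMap.size())}).first->second;
    }

    int main() {
      std::map<const void *, int> SNMapL, SNMapR;
      int AL, BL, AR, BR; // stand-ins for local values of the two functions
      // The walk meets (AL, AR) first, then (BL, BR), then (AL, AR) again.
      assert(serialNumber(SNMapL, &AL) == serialNumber(SNMapR, &AR)); // both 0
      assert(serialNumber(SNMapL, &BL) == serialNumber(SNMapR, &BR)); // both 1
      assert(serialNumber(SNMapL, &AL) == serialNumber(SNMapR, &AR)); // still 0
    }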
+/// Constants comparison:
+/// 1. Check whether the type of the L constant can be losslessly bitcast to
+///    the type of R.
+/// 2. Compare the constant contents.
+/// For more details see the declaration comments.
+int FunctionComparator::cmpConstants(const Constant *L, const Constant *R) {
+
+  Type *TyL = L->getType();
+  Type *TyR = R->getType();
+
+  // Check whether the types are bitcastable. This part is just a refactoring
+  // of the Type::canLosslesslyBitCastTo method, but instead of returning
+  // true/false, we also pack into the result which type is "less" for us.
+  int TypesRes = cmpType(TyL, TyR);
+  if (TypesRes != 0) {
+    // Types are different, but check whether we can bitcast them.
+    if (!TyL->isFirstClassType()) {
+      if (TyR->isFirstClassType())
+        return -1;
+      // Neither TyL nor TyR is a value of first-class type. Return the
+      // result of comparing the types.
+      return TypesRes;
+    }
+    if (!TyR->isFirstClassType()) {
+      if (TyL->isFirstClassType())
+        return 1;
+      return TypesRes;
+    }
+
+    // Vector -> Vector conversions are always lossless if the two vector
+    // types have the same size, otherwise not.
+    unsigned TyLWidth = 0;
+    unsigned TyRWidth = 0;
+
+    if (const VectorType *VecTyL = dyn_cast<VectorType>(TyL))
+      TyLWidth = VecTyL->getBitWidth();
+    if (const VectorType *VecTyR = dyn_cast<VectorType>(TyR))
+      TyRWidth = VecTyR->getBitWidth();
+
+    if (TyLWidth != TyRWidth)
+      return cmpNumbers(TyLWidth, TyRWidth);
+
+    // Zero bit-width means neither TyL nor TyR are vectors.
+    if (!TyLWidth) {
+      PointerType *PTyL = dyn_cast<PointerType>(TyL);
+      PointerType *PTyR = dyn_cast<PointerType>(TyR);
+      if (PTyL && PTyR) {
+        unsigned AddrSpaceL = PTyL->getAddressSpace();
+        unsigned AddrSpaceR = PTyR->getAddressSpace();
+        if (int Res = cmpNumbers(AddrSpaceL, AddrSpaceR))
+          return Res;
+      }
+      if (PTyL)
+        return 1;
+      if (PTyR)
+        return -1;
+
+      // TyL and TyR aren't vectors, nor pointers. We don't know how to
+      // bitcast them.
+      return TypesRes;
+    }
+  }
+
+  // OK, the types are bitcastable; now check the constant contents.
+ + if (L->isNullValue() && R->isNullValue()) + return TypesRes; + if (L->isNullValue() && !R->isNullValue()) + return 1; + if (!L->isNullValue() && R->isNullValue()) + return -1; + + if (int Res = cmpNumbers(L->getValueID(), R->getValueID())) + return Res; + + switch (L->getValueID()) { + case Value::UndefValueVal: return TypesRes; + case Value::ConstantIntVal: { + const APInt &LInt = cast<ConstantInt>(L)->getValue(); + const APInt &RInt = cast<ConstantInt>(R)->getValue(); + return cmpAPInt(LInt, RInt); + } + case Value::ConstantFPVal: { + const APFloat &LAPF = cast<ConstantFP>(L)->getValueAPF(); + const APFloat &RAPF = cast<ConstantFP>(R)->getValueAPF(); + return cmpAPFloat(LAPF, RAPF); + } + case Value::ConstantArrayVal: { + const ConstantArray *LA = cast<ConstantArray>(L); + const ConstantArray *RA = cast<ConstantArray>(R); + uint64_t NumElementsL = cast<ArrayType>(TyL)->getNumElements(); + uint64_t NumElementsR = cast<ArrayType>(TyR)->getNumElements(); + if (int Res = cmpNumbers(NumElementsL, NumElementsR)) + return Res; + for (uint64_t i = 0; i < NumElementsL; ++i) { + if (int Res = cmpConstants(cast<Constant>(LA->getOperand(i)), + cast<Constant>(RA->getOperand(i)))) + return Res; + } + return 0; + } + case Value::ConstantStructVal: { + const ConstantStruct *LS = cast<ConstantStruct>(L); + const ConstantStruct *RS = cast<ConstantStruct>(R); + unsigned NumElementsL = cast<StructType>(TyL)->getNumElements(); + unsigned NumElementsR = cast<StructType>(TyR)->getNumElements(); + if (int Res = cmpNumbers(NumElementsL, NumElementsR)) + return Res; + for (unsigned i = 0; i != NumElementsL; ++i) { + if (int Res = cmpConstants(cast<Constant>(LS->getOperand(i)), + cast<Constant>(RS->getOperand(i)))) + return Res; + } + return 0; + } + case Value::ConstantVectorVal: { + const ConstantVector *LV = cast<ConstantVector>(L); + const ConstantVector *RV = cast<ConstantVector>(R); + unsigned NumElementsL = cast<VectorType>(TyL)->getNumElements(); + unsigned NumElementsR = cast<VectorType>(TyR)->getNumElements(); + if (int Res = cmpNumbers(NumElementsL, NumElementsR)) + return Res; + for (uint64_t i = 0; i < NumElementsL; ++i) { + if (int Res = cmpConstants(cast<Constant>(LV->getOperand(i)), + cast<Constant>(RV->getOperand(i)))) + return Res; + } + return 0; + } + case Value::ConstantExprVal: { + const ConstantExpr *LE = cast<ConstantExpr>(L); + const ConstantExpr *RE = cast<ConstantExpr>(R); + unsigned NumOperandsL = LE->getNumOperands(); + unsigned NumOperandsR = RE->getNumOperands(); + if (int Res = cmpNumbers(NumOperandsL, NumOperandsR)) + return Res; + for (unsigned i = 0; i < NumOperandsL; ++i) { + if (int Res = cmpConstants(cast<Constant>(LE->getOperand(i)), + cast<Constant>(RE->getOperand(i)))) + return Res; + } + return 0; + } + case Value::FunctionVal: + case Value::GlobalVariableVal: + case Value::GlobalAliasVal: + default: // Unknown constant, cast L and R pointers to numbers and compare. + return cmpNumbers((uint64_t)L, (uint64_t)R); + } +} + +/// cmpType - compares two types, +/// defines total ordering among the types set. +/// See method declaration comments for more details. 
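Before the cmpType body that follows, a note on the idiom all of these routines share: they chain three-way comparisons with "if (int Res = cmp(...)) return Res;", so the first stage that distinguishes L from R decides the order and later stages only break ties. A minimal stand-alone illustration (Op and cmpOp are invented for the example; cmpNumbers matches the patch):

    #include <cassert>
    #include <cstdint>

    static int cmpNumbers(uint64_t L, uint64_t R) {
      if (L < R) return -1;
      if (L > R) return 1;
      return 0;
    }

    struct Op { unsigned Opcode, NumOperands; };

    static int cmpOp(const Op &L, const Op &R) {
      if (int Res = cmpNumbers(L.Opcode, R.Opcode))
        return Res;                                    // most significant stage
      return cmpNumbers(L.NumOperands, R.NumOperands); // tie-breaking stage
    }

    int main() {
      assert(cmpOp({1, 5}, {2, 0}) == -1); // opcode decides before operand count
      assert(cmpOp({1, 2}, {1, 3}) == -1); // equal opcodes: operand count decides
      assert(cmpOp({1, 2}, {1, 2}) == 0);  // equal at every stage
    }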
+int FunctionComparator::cmpType(Type *TyL, Type *TyR) const { - switch (Ty1->getTypeID()) { + PointerType *PTyL = dyn_cast<PointerType>(TyL); + PointerType *PTyR = dyn_cast<PointerType>(TyR); + + if (DL) { + if (PTyL && PTyL->getAddressSpace() == 0) TyL = DL->getIntPtrType(TyL); + if (PTyR && PTyR->getAddressSpace() == 0) TyR = DL->getIntPtrType(TyR); + } + + if (TyL == TyR) + return 0; + + if (int Res = cmpNumbers(TyL->getTypeID(), TyR->getTypeID())) + return Res; + + switch (TyL->getTypeID()) { default: llvm_unreachable("Unknown type!"); // Fall through in Release mode. case Type::IntegerTyID: case Type::VectorTyID: - // Ty1 == Ty2 would have returned true earlier. - return false; + // TyL == TyR would have returned true earlier. + return cmpNumbers((uint64_t)TyL, (uint64_t)TyR); case Type::VoidTyID: case Type::FloatTyID: @@ -242,51 +648,55 @@ bool FunctionComparator::isEquivalentType(Type *Ty1, Type *Ty2) const { case Type::PPC_FP128TyID: case Type::LabelTyID: case Type::MetadataTyID: - return true; + return 0; case Type::PointerTyID: { - assert(PTy1 && PTy2 && "Both types must be pointers here."); - return PTy1->getAddressSpace() == PTy2->getAddressSpace(); + assert(PTyL && PTyR && "Both types must be pointers here."); + return cmpNumbers(PTyL->getAddressSpace(), PTyR->getAddressSpace()); } case Type::StructTyID: { - StructType *STy1 = cast<StructType>(Ty1); - StructType *STy2 = cast<StructType>(Ty2); - if (STy1->getNumElements() != STy2->getNumElements()) - return false; - - if (STy1->isPacked() != STy2->isPacked()) - return false; - - for (unsigned i = 0, e = STy1->getNumElements(); i != e; ++i) { - if (!isEquivalentType(STy1->getElementType(i), STy2->getElementType(i))) - return false; + StructType *STyL = cast<StructType>(TyL); + StructType *STyR = cast<StructType>(TyR); + if (STyL->getNumElements() != STyR->getNumElements()) + return cmpNumbers(STyL->getNumElements(), STyR->getNumElements()); + + if (STyL->isPacked() != STyR->isPacked()) + return cmpNumbers(STyL->isPacked(), STyR->isPacked()); + + for (unsigned i = 0, e = STyL->getNumElements(); i != e; ++i) { + if (int Res = cmpType(STyL->getElementType(i), + STyR->getElementType(i))) + return Res; } - return true; + return 0; } case Type::FunctionTyID: { - FunctionType *FTy1 = cast<FunctionType>(Ty1); - FunctionType *FTy2 = cast<FunctionType>(Ty2); - if (FTy1->getNumParams() != FTy2->getNumParams() || - FTy1->isVarArg() != FTy2->isVarArg()) - return false; + FunctionType *FTyL = cast<FunctionType>(TyL); + FunctionType *FTyR = cast<FunctionType>(TyR); + if (FTyL->getNumParams() != FTyR->getNumParams()) + return cmpNumbers(FTyL->getNumParams(), FTyR->getNumParams()); - if (!isEquivalentType(FTy1->getReturnType(), FTy2->getReturnType())) - return false; + if (FTyL->isVarArg() != FTyR->isVarArg()) + return cmpNumbers(FTyL->isVarArg(), FTyR->isVarArg()); + + if (int Res = cmpType(FTyL->getReturnType(), FTyR->getReturnType())) + return Res; - for (unsigned i = 0, e = FTy1->getNumParams(); i != e; ++i) { - if (!isEquivalentType(FTy1->getParamType(i), FTy2->getParamType(i))) - return false; + for (unsigned i = 0, e = FTyL->getNumParams(); i != e; ++i) { + if (int Res = cmpType(FTyL->getParamType(i), FTyR->getParamType(i))) + return Res; } - return true; + return 0; } case Type::ArrayTyID: { - ArrayType *ATy1 = cast<ArrayType>(Ty1); - ArrayType *ATy2 = cast<ArrayType>(Ty2); - return ATy1->getNumElements() == ATy2->getNumElements() && - isEquivalentType(ATy1->getElementType(), ATy2->getElementType()); + ArrayType *ATyL = 
cast<ArrayType>(TyL); + ArrayType *ATyR = cast<ArrayType>(TyR); + if (ATyL->getNumElements() != ATyR->getNumElements()) + return cmpNumbers(ATyL->getNumElements(), ATyR->getNumElements()); + return cmpType(ATyL->getElementType(), ATyR->getElementType()); } } } @@ -294,222 +704,323 @@ bool FunctionComparator::isEquivalentType(Type *Ty1, Type *Ty2) const { // Determine whether the two operations are the same except that pointer-to-A // and pointer-to-B are equivalent. This should be kept in sync with // Instruction::isSameOperationAs. -bool FunctionComparator::isEquivalentOperation(const Instruction *I1, - const Instruction *I2) const { +// Read method declaration comments for more details. +int FunctionComparator::cmpOperation(const Instruction *L, + const Instruction *R) const { // Differences from Instruction::isSameOperationAs: // * replace type comparison with calls to isEquivalentType. // * we test for I->hasSameSubclassOptionalData (nuw/nsw/tail) at the top // * because of the above, we don't test for the tail bit on calls later on - if (I1->getOpcode() != I2->getOpcode() || - I1->getNumOperands() != I2->getNumOperands() || - !isEquivalentType(I1->getType(), I2->getType()) || - !I1->hasSameSubclassOptionalData(I2)) - return false; + if (int Res = cmpNumbers(L->getOpcode(), R->getOpcode())) + return Res; + + if (int Res = cmpNumbers(L->getNumOperands(), R->getNumOperands())) + return Res; + + if (int Res = cmpType(L->getType(), R->getType())) + return Res; + + if (int Res = cmpNumbers(L->getRawSubclassOptionalData(), + R->getRawSubclassOptionalData())) + return Res; // We have two instructions of identical opcode and #operands. Check to see // if all operands are the same type - for (unsigned i = 0, e = I1->getNumOperands(); i != e; ++i) - if (!isEquivalentType(I1->getOperand(i)->getType(), - I2->getOperand(i)->getType())) - return false; + for (unsigned i = 0, e = L->getNumOperands(); i != e; ++i) { + if (int Res = + cmpType(L->getOperand(i)->getType(), R->getOperand(i)->getType())) + return Res; + } // Check special state that is a part of some instructions. 
- if (const LoadInst *LI = dyn_cast<LoadInst>(I1)) - return LI->isVolatile() == cast<LoadInst>(I2)->isVolatile() && - LI->getAlignment() == cast<LoadInst>(I2)->getAlignment() && - LI->getOrdering() == cast<LoadInst>(I2)->getOrdering() && - LI->getSynchScope() == cast<LoadInst>(I2)->getSynchScope(); - if (const StoreInst *SI = dyn_cast<StoreInst>(I1)) - return SI->isVolatile() == cast<StoreInst>(I2)->isVolatile() && - SI->getAlignment() == cast<StoreInst>(I2)->getAlignment() && - SI->getOrdering() == cast<StoreInst>(I2)->getOrdering() && - SI->getSynchScope() == cast<StoreInst>(I2)->getSynchScope(); - if (const CmpInst *CI = dyn_cast<CmpInst>(I1)) - return CI->getPredicate() == cast<CmpInst>(I2)->getPredicate(); - if (const CallInst *CI = dyn_cast<CallInst>(I1)) - return CI->getCallingConv() == cast<CallInst>(I2)->getCallingConv() && - CI->getAttributes() == cast<CallInst>(I2)->getAttributes(); - if (const InvokeInst *CI = dyn_cast<InvokeInst>(I1)) - return CI->getCallingConv() == cast<InvokeInst>(I2)->getCallingConv() && - CI->getAttributes() == cast<InvokeInst>(I2)->getAttributes(); - if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(I1)) - return IVI->getIndices() == cast<InsertValueInst>(I2)->getIndices(); - if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(I1)) - return EVI->getIndices() == cast<ExtractValueInst>(I2)->getIndices(); - if (const FenceInst *FI = dyn_cast<FenceInst>(I1)) - return FI->getOrdering() == cast<FenceInst>(I2)->getOrdering() && - FI->getSynchScope() == cast<FenceInst>(I2)->getSynchScope(); - if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(I1)) - return CXI->isVolatile() == cast<AtomicCmpXchgInst>(I2)->isVolatile() && - CXI->getOrdering() == cast<AtomicCmpXchgInst>(I2)->getOrdering() && - CXI->getSynchScope() == cast<AtomicCmpXchgInst>(I2)->getSynchScope(); - if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I1)) - return RMWI->getOperation() == cast<AtomicRMWInst>(I2)->getOperation() && - RMWI->isVolatile() == cast<AtomicRMWInst>(I2)->isVolatile() && - RMWI->getOrdering() == cast<AtomicRMWInst>(I2)->getOrdering() && - RMWI->getSynchScope() == cast<AtomicRMWInst>(I2)->getSynchScope(); + if (const LoadInst *LI = dyn_cast<LoadInst>(L)) { + if (int Res = cmpNumbers(LI->isVolatile(), cast<LoadInst>(R)->isVolatile())) + return Res; + if (int Res = + cmpNumbers(LI->getAlignment(), cast<LoadInst>(R)->getAlignment())) + return Res; + if (int Res = + cmpNumbers(LI->getOrdering(), cast<LoadInst>(R)->getOrdering())) + return Res; + if (int Res = + cmpNumbers(LI->getSynchScope(), cast<LoadInst>(R)->getSynchScope())) + return Res; + return cmpNumbers((uint64_t)LI->getMetadata(LLVMContext::MD_range), + (uint64_t)cast<LoadInst>(R)->getMetadata(LLVMContext::MD_range)); + } + if (const StoreInst *SI = dyn_cast<StoreInst>(L)) { + if (int Res = + cmpNumbers(SI->isVolatile(), cast<StoreInst>(R)->isVolatile())) + return Res; + if (int Res = + cmpNumbers(SI->getAlignment(), cast<StoreInst>(R)->getAlignment())) + return Res; + if (int Res = + cmpNumbers(SI->getOrdering(), cast<StoreInst>(R)->getOrdering())) + return Res; + return cmpNumbers(SI->getSynchScope(), cast<StoreInst>(R)->getSynchScope()); + } + if (const CmpInst *CI = dyn_cast<CmpInst>(L)) + return cmpNumbers(CI->getPredicate(), cast<CmpInst>(R)->getPredicate()); + if (const CallInst *CI = dyn_cast<CallInst>(L)) { + if (int Res = cmpNumbers(CI->getCallingConv(), + cast<CallInst>(R)->getCallingConv())) + return Res; + if (int Res = + cmpAttrs(CI->getAttributes(), 
cast<CallInst>(R)->getAttributes())) + return Res; + return cmpNumbers( + (uint64_t)CI->getMetadata(LLVMContext::MD_range), + (uint64_t)cast<CallInst>(R)->getMetadata(LLVMContext::MD_range)); + } + if (const InvokeInst *CI = dyn_cast<InvokeInst>(L)) { + if (int Res = cmpNumbers(CI->getCallingConv(), + cast<InvokeInst>(R)->getCallingConv())) + return Res; + if (int Res = + cmpAttrs(CI->getAttributes(), cast<InvokeInst>(R)->getAttributes())) + return Res; + return cmpNumbers( + (uint64_t)CI->getMetadata(LLVMContext::MD_range), + (uint64_t)cast<InvokeInst>(R)->getMetadata(LLVMContext::MD_range)); + } + if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(L)) { + ArrayRef<unsigned> LIndices = IVI->getIndices(); + ArrayRef<unsigned> RIndices = cast<InsertValueInst>(R)->getIndices(); + if (int Res = cmpNumbers(LIndices.size(), RIndices.size())) + return Res; + for (size_t i = 0, e = LIndices.size(); i != e; ++i) { + if (int Res = cmpNumbers(LIndices[i], RIndices[i])) + return Res; + } + } + if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(L)) { + ArrayRef<unsigned> LIndices = EVI->getIndices(); + ArrayRef<unsigned> RIndices = cast<ExtractValueInst>(R)->getIndices(); + if (int Res = cmpNumbers(LIndices.size(), RIndices.size())) + return Res; + for (size_t i = 0, e = LIndices.size(); i != e; ++i) { + if (int Res = cmpNumbers(LIndices[i], RIndices[i])) + return Res; + } + } + if (const FenceInst *FI = dyn_cast<FenceInst>(L)) { + if (int Res = + cmpNumbers(FI->getOrdering(), cast<FenceInst>(R)->getOrdering())) + return Res; + return cmpNumbers(FI->getSynchScope(), cast<FenceInst>(R)->getSynchScope()); + } - return true; + if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(L)) { + if (int Res = cmpNumbers(CXI->isVolatile(), + cast<AtomicCmpXchgInst>(R)->isVolatile())) + return Res; + if (int Res = cmpNumbers(CXI->isWeak(), + cast<AtomicCmpXchgInst>(R)->isWeak())) + return Res; + if (int Res = cmpNumbers(CXI->getSuccessOrdering(), + cast<AtomicCmpXchgInst>(R)->getSuccessOrdering())) + return Res; + if (int Res = cmpNumbers(CXI->getFailureOrdering(), + cast<AtomicCmpXchgInst>(R)->getFailureOrdering())) + return Res; + return cmpNumbers(CXI->getSynchScope(), + cast<AtomicCmpXchgInst>(R)->getSynchScope()); + } + if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(L)) { + if (int Res = cmpNumbers(RMWI->getOperation(), + cast<AtomicRMWInst>(R)->getOperation())) + return Res; + if (int Res = cmpNumbers(RMWI->isVolatile(), + cast<AtomicRMWInst>(R)->isVolatile())) + return Res; + if (int Res = cmpNumbers(RMWI->getOrdering(), + cast<AtomicRMWInst>(R)->getOrdering())) + return Res; + return cmpNumbers(RMWI->getSynchScope(), + cast<AtomicRMWInst>(R)->getSynchScope()); + } + return 0; } // Determine whether two GEP operations perform the same underlying arithmetic. -bool FunctionComparator::isEquivalentGEP(const GEPOperator *GEP1, - const GEPOperator *GEP2) { - unsigned AS = GEP1->getPointerAddressSpace(); - if (AS != GEP2->getPointerAddressSpace()) - return false; - - if (TD) { - // When we have target data, we can reduce the GEP down to the value in bytes - // added to the address. - unsigned BitWidth = TD ? TD->getPointerSizeInBits(AS) : 1; - APInt Offset1(BitWidth, 0), Offset2(BitWidth, 0); - if (GEP1->accumulateConstantOffset(*TD, Offset1) && - GEP2->accumulateConstantOffset(*TD, Offset2)) { - return Offset1 == Offset2; - } +// Read method declaration comments for more details. 
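The cmpGEP routine defined next prefers a semantic comparison, folding both GEPs down to a constant byte offset via accumulateConstantOffset whenever DataLayout is available, and falls back to an operand-by-operand walk otherwise. The point of the offset form is that syntactically different address computations can be equal; a standalone C++ analogy (hypothetical viaStruct/viaBytes helpers, assuming 4-byte int32_t fields and no padding):

#include <cstdint>

struct S { int32_t a, b; };
static_assert(sizeof(S) == 8, "sketch assumes 4-byte fields and no padding");

// Both functions compute base + 4; an offset-based comparison can order
// them equal where a purely structural, operand-by-operand walk could not.
char *viaStruct(S *Base) { return reinterpret_cast<char *>(&Base->b); }
char *viaBytes(S *Base) { return reinterpret_cast<char *>(Base) + 4; }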
+int FunctionComparator::cmpGEP(const GEPOperator *GEPL, + const GEPOperator *GEPR) { + + unsigned int ASL = GEPL->getPointerAddressSpace(); + unsigned int ASR = GEPR->getPointerAddressSpace(); + + if (int Res = cmpNumbers(ASL, ASR)) + return Res; + + // When we have target data, we can reduce the GEP down to the value in bytes + // added to the address. + if (DL) { + unsigned BitWidth = DL->getPointerSizeInBits(ASL); + APInt OffsetL(BitWidth, 0), OffsetR(BitWidth, 0); + if (GEPL->accumulateConstantOffset(*DL, OffsetL) && + GEPR->accumulateConstantOffset(*DL, OffsetR)) + return cmpAPInt(OffsetL, OffsetR); } - if (GEP1->getPointerOperand()->getType() != - GEP2->getPointerOperand()->getType()) - return false; + if (int Res = cmpNumbers((uint64_t)GEPL->getPointerOperand()->getType(), + (uint64_t)GEPR->getPointerOperand()->getType())) + return Res; - if (GEP1->getNumOperands() != GEP2->getNumOperands()) - return false; + if (int Res = cmpNumbers(GEPL->getNumOperands(), GEPR->getNumOperands())) + return Res; - for (unsigned i = 0, e = GEP1->getNumOperands(); i != e; ++i) { - if (!enumerate(GEP1->getOperand(i), GEP2->getOperand(i))) - return false; + for (unsigned i = 0, e = GEPL->getNumOperands(); i != e; ++i) { + if (int Res = cmpValues(GEPL->getOperand(i), GEPR->getOperand(i))) + return Res; } - return true; + return 0; } -// Compare two values used by the two functions under pair-wise comparison. If -// this is the first time the values are seen, they're added to the mapping so -// that we will detect mismatches on next use. -bool FunctionComparator::enumerate(const Value *V1, const Value *V2) { - // Check for function @f1 referring to itself and function @f2 referring to - // itself, or referring to each other, or both referring to either of them. - // They're all equivalent if the two functions are otherwise equivalent. - if (V1 == F1 && V2 == F2) - return true; - if (V1 == F2 && V2 == F1) - return true; - - if (const Constant *C1 = dyn_cast<Constant>(V1)) { - if (V1 == V2) return true; - const Constant *C2 = dyn_cast<Constant>(V2); - if (!C2) return false; - // TODO: constant expressions with GEP or references to F1 or F2. - if (C1->isNullValue() && C2->isNullValue() && - isEquivalentType(C1->getType(), C2->getType())) - return true; - // Try bitcasting C2 to C1's type. If the bitcast is legal and returns C1 - // then they must have equal bit patterns. - return C1->getType()->canLosslesslyBitCastTo(C2->getType()) && - C1 == ConstantExpr::getBitCast(const_cast<Constant*>(C2), C1->getType()); - } - - if (isa<InlineAsm>(V1) || isa<InlineAsm>(V2)) - return V1 == V2; - - // Check that V1 maps to V2. If we find a value that V1 maps to then we simply - // check whether it's equal to V2. When there is no mapping then we need to - // ensure that V2 isn't already equivalent to something else. For this - // purpose, we track the V2 values in a set. - - const Value *&map_elem = id_map[V1]; - if (map_elem) - return map_elem == V2; - if (!seen_values.insert(V2).second) - return false; - map_elem = V2; - return true; -} - -// Test whether two basic blocks have equivalent behaviour. -bool FunctionComparator::compare(const BasicBlock *BB1, const BasicBlock *BB2) { - BasicBlock::const_iterator F1I = BB1->begin(), F1E = BB1->end(); - BasicBlock::const_iterator F2I = BB2->begin(), F2E = BB2->end(); +/// Compare two values used by the two functions under pair-wise comparison. If +/// this is the first time the values are seen, they're added to the mapping so +/// that we will detect mismatches on next use. 
+/// See comments in declaration for more details. +int FunctionComparator::cmpValues(const Value *L, const Value *R) { + // Catch self-reference case. + if (L == FnL) { + if (R == FnR) + return 0; + return -1; + } + if (R == FnR) { + if (L == FnL) + return 0; + return 1; + } - do { - if (!enumerate(F1I, F2I)) - return false; + const Constant *ConstL = dyn_cast<Constant>(L); + const Constant *ConstR = dyn_cast<Constant>(R); + if (ConstL && ConstR) { + if (L == R) + return 0; + return cmpConstants(ConstL, ConstR); + } - if (const GetElementPtrInst *GEP1 = dyn_cast<GetElementPtrInst>(F1I)) { - const GetElementPtrInst *GEP2 = dyn_cast<GetElementPtrInst>(F2I); - if (!GEP2) - return false; + if (ConstL) + return 1; + if (ConstR) + return -1; - if (!enumerate(GEP1->getPointerOperand(), GEP2->getPointerOperand())) - return false; + const InlineAsm *InlineAsmL = dyn_cast<InlineAsm>(L); + const InlineAsm *InlineAsmR = dyn_cast<InlineAsm>(R); - if (!isEquivalentGEP(GEP1, GEP2)) - return false; - } else { - if (!isEquivalentOperation(F1I, F2I)) - return false; + if (InlineAsmL && InlineAsmR) + return cmpNumbers((uint64_t)L, (uint64_t)R); + if (InlineAsmL) + return 1; + if (InlineAsmR) + return -1; - assert(F1I->getNumOperands() == F2I->getNumOperands()); - for (unsigned i = 0, e = F1I->getNumOperands(); i != e; ++i) { - Value *OpF1 = F1I->getOperand(i); - Value *OpF2 = F2I->getOperand(i); + auto LeftSN = sn_mapL.insert(std::make_pair(L, sn_mapL.size())), + RightSN = sn_mapR.insert(std::make_pair(R, sn_mapR.size())); - if (!enumerate(OpF1, OpF2)) - return false; + return cmpNumbers(LeftSN.first->second, RightSN.first->second); +} +// Test whether two basic blocks have equivalent behaviour. +int FunctionComparator::compare(const BasicBlock *BBL, const BasicBlock *BBR) { + BasicBlock::const_iterator InstL = BBL->begin(), InstLE = BBL->end(); + BasicBlock::const_iterator InstR = BBR->begin(), InstRE = BBR->end(); - if (OpF1->getValueID() != OpF2->getValueID() || - !isEquivalentType(OpF1->getType(), OpF2->getType())) - return false; + do { + if (int Res = cmpValues(InstL, InstR)) + return Res; + + const GetElementPtrInst *GEPL = dyn_cast<GetElementPtrInst>(InstL); + const GetElementPtrInst *GEPR = dyn_cast<GetElementPtrInst>(InstR); + + if (GEPL && !GEPR) + return 1; + if (GEPR && !GEPL) + return -1; + + if (GEPL && GEPR) { + if (int Res = + cmpValues(GEPL->getPointerOperand(), GEPR->getPointerOperand())) + return Res; + if (int Res = cmpGEP(GEPL, GEPR)) + return Res; + } else { + if (int Res = cmpOperation(InstL, InstR)) + return Res; + assert(InstL->getNumOperands() == InstR->getNumOperands()); + + for (unsigned i = 0, e = InstL->getNumOperands(); i != e; ++i) { + Value *OpL = InstL->getOperand(i); + Value *OpR = InstR->getOperand(i); + if (int Res = cmpValues(OpL, OpR)) + return Res; + if (int Res = cmpNumbers(OpL->getValueID(), OpR->getValueID())) + return Res; + // TODO: Already checked in cmpOperation + if (int Res = cmpType(OpL->getType(), OpR->getType())) + return Res; } } - ++F1I, ++F2I; - } while (F1I != F1E && F2I != F2E); + ++InstL, ++InstR; + } while (InstL != InstLE && InstR != InstRE); - return F1I == F1E && F2I == F2E; + if (InstL != InstLE && InstR == InstRE) + return 1; + if (InstL == InstLE && InstR != InstRE) + return -1; + return 0; } // Test whether the two functions have equivalent behaviour. -bool FunctionComparator::compare() { - // We need to recheck everything, but check the things that weren't included - // in the hash first. 
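For values that are neither the functions themselves, constants, nor inline asm, cmpValues above falls back to serial numbers: each side hands out the next unused id at a value's first appearance, so two functions compare equal exactly when their values recur in the same pattern. A miniature of the sn_map scheme (hypothetical SerialNumberer type, sketch only):

#include <cstdint>
#include <map>

struct SerialNumberer {
  std::map<const void *, uint64_t> MapL, MapR;

  // Number each value by order of first appearance on its own side, then
  // three-way compare the numbers, as sn_mapL/sn_mapR do above.
  int cmpValues(const void *L, const void *R) {
    uint64_t NL = MapL.insert({L, MapL.size()}).first->second;
    uint64_t NR = MapR.insert({R, MapR.size()}).first->second;
    return NL < NR ? -1 : (NL > NR ? 1 : 0);
  }
};

For example, the left stream (a, b, a) against the right stream (x, y, x) compares equal under this scheme, while (a, b, a) against (x, y, y) does not.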
+int FunctionComparator::compare() { - if (F1->getAttributes() != F2->getAttributes()) - return false; + sn_mapL.clear(); + sn_mapR.clear(); - if (F1->hasGC() != F2->hasGC()) - return false; + if (int Res = cmpAttrs(FnL->getAttributes(), FnR->getAttributes())) + return Res; - if (F1->hasGC() && F1->getGC() != F2->getGC()) - return false; + if (int Res = cmpNumbers(FnL->hasGC(), FnR->hasGC())) + return Res; - if (F1->hasSection() != F2->hasSection()) - return false; + if (FnL->hasGC()) { + if (int Res = cmpNumbers((uint64_t)FnL->getGC(), (uint64_t)FnR->getGC())) + return Res; + } - if (F1->hasSection() && F1->getSection() != F2->getSection()) - return false; + if (int Res = cmpNumbers(FnL->hasSection(), FnR->hasSection())) + return Res; - if (F1->isVarArg() != F2->isVarArg()) - return false; + if (FnL->hasSection()) { + if (int Res = cmpStrings(FnL->getSection(), FnR->getSection())) + return Res; + } + + if (int Res = cmpNumbers(FnL->isVarArg(), FnR->isVarArg())) + return Res; // TODO: if it's internal and only used in direct calls, we could handle this // case too. - if (F1->getCallingConv() != F2->getCallingConv()) - return false; + if (int Res = cmpNumbers(FnL->getCallingConv(), FnR->getCallingConv())) + return Res; - if (!isEquivalentType(F1->getFunctionType(), F2->getFunctionType())) - return false; + if (int Res = cmpType(FnL->getFunctionType(), FnR->getFunctionType())) + return Res; - assert(F1->arg_size() == F2->arg_size() && + assert(FnL->arg_size() == FnR->arg_size() && "Identically typed functions have different numbers of args!"); // Visit the arguments so that they get enumerated in the order they're // passed in. - for (Function::const_arg_iterator f1i = F1->arg_begin(), - f2i = F2->arg_begin(), f1e = F1->arg_end(); f1i != f1e; ++f1i, ++f2i) { - if (!enumerate(f1i, f2i)) + for (Function::const_arg_iterator ArgLI = FnL->arg_begin(), + ArgRI = FnR->arg_begin(), + ArgLE = FnL->arg_end(); + ArgLI != ArgLE; ++ArgLI, ++ArgRI) { + if (cmpValues(ArgLI, ArgRI) != 0) llvm_unreachable("Arguments repeat!"); } @@ -517,33 +1028,36 @@ bool FunctionComparator::compare() { // linked list is immaterial. Our walk starts at the entry block for both // functions, then takes each block from each terminator in order. As an // artifact, this also means that unreachable blocks are ignored. - SmallVector<const BasicBlock *, 8> F1BBs, F2BBs; + SmallVector<const BasicBlock *, 8> FnLBBs, FnRBBs; SmallSet<const BasicBlock *, 128> VisitedBBs; // in terms of F1. 
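The basic-block walk that follows visits both CFGs in lockstep: one worklist per function, successors pushed at matching positions, and a visited set kept only in terms of the left function, which works because the successor counts are asserted equal once the terminators compare equal. A standalone sketch of the traversal shape (hypothetical Node type, C++17):

#include <cstddef>
#include <set>
#include <utility>
#include <vector>

struct Node { std::vector<const Node *> Succs; };

// Walk two same-shaped graphs in lockstep, pairing the i-th successor on
// the left with the i-th successor on the right; each left node is
// visited exactly once.
void walkInLockstep(const Node *L, const Node *R,
                    void (*Visit)(const Node *, const Node *)) {
  std::vector<std::pair<const Node *, const Node *>> Stack{{L, R}};
  std::set<const Node *> Seen{L};
  while (!Stack.empty()) {
    auto [BBL, BBR] = Stack.back();
    Stack.pop_back();
    Visit(BBL, BBR);
    for (size_t I = 0, E = BBL->Succs.size(); I != E; ++I) {
      if (!Seen.insert(BBL->Succs[I]).second)
        continue;
      Stack.emplace_back(BBL->Succs[I], BBR->Succs[I]);
    }
  }
}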
- F1BBs.push_back(&F1->getEntryBlock()); - F2BBs.push_back(&F2->getEntryBlock()); + FnLBBs.push_back(&FnL->getEntryBlock()); + FnRBBs.push_back(&FnR->getEntryBlock()); - VisitedBBs.insert(F1BBs[0]); - while (!F1BBs.empty()) { - const BasicBlock *F1BB = F1BBs.pop_back_val(); - const BasicBlock *F2BB = F2BBs.pop_back_val(); + VisitedBBs.insert(FnLBBs[0]); + while (!FnLBBs.empty()) { + const BasicBlock *BBL = FnLBBs.pop_back_val(); + const BasicBlock *BBR = FnRBBs.pop_back_val(); - if (!enumerate(F1BB, F2BB) || !compare(F1BB, F2BB)) - return false; + if (int Res = cmpValues(BBL, BBR)) + return Res; - const TerminatorInst *F1TI = F1BB->getTerminator(); - const TerminatorInst *F2TI = F2BB->getTerminator(); + if (int Res = compare(BBL, BBR)) + return Res; - assert(F1TI->getNumSuccessors() == F2TI->getNumSuccessors()); - for (unsigned i = 0, e = F1TI->getNumSuccessors(); i != e; ++i) { - if (!VisitedBBs.insert(F1TI->getSuccessor(i))) + const TerminatorInst *TermL = BBL->getTerminator(); + const TerminatorInst *TermR = BBR->getTerminator(); + + assert(TermL->getNumSuccessors() == TermR->getNumSuccessors()); + for (unsigned i = 0, e = TermL->getNumSuccessors(); i != e; ++i) { + if (!VisitedBBs.insert(TermL->getSuccessor(i))) continue; - F1BBs.push_back(F1TI->getSuccessor(i)); - F2BBs.push_back(F2TI->getSuccessor(i)); + FnLBBs.push_back(TermL->getSuccessor(i)); + FnRBBs.push_back(TermR->getSuccessor(i)); } } - return true; + return 0; } namespace { @@ -561,24 +1075,28 @@ public: initializeMergeFunctionsPass(*PassRegistry::getPassRegistry()); } - bool runOnModule(Module &M); + bool runOnModule(Module &M) override; private: - typedef DenseSet<ComparableFunction> FnSetType; + typedef std::set<FunctionPtr> FnTreeType; /// A work queue of functions that may have been modified and should be /// analyzed again. std::vector<WeakVH> Deferred; - /// Insert a ComparableFunction into the FnSet, or merge it away if it's + /// Checks the rules of the order relation introduced on the function set. + /// Returns true if the sanity check passed, and false if it failed. + bool doSanityCheck(std::vector<WeakVH> &Worklist); + + /// Insert a ComparableFunction into the FnTree, or merge it away if it's /// equal to one that's already present. - bool insert(ComparableFunction &NewF); + bool insert(Function *NewFunction); - /// Remove a Function from the FnSet and queue it up for a second sweep of + /// Remove a Function from the FnTree and queue it up for a second sweep of /// analysis. void remove(Function *F); - /// Find the functions that use this Value and remove them from FnSet and + /// Find the functions that use this Value and remove them from FnTree and /// queue the functions. void removeUsers(Value *V); @@ -603,10 +1121,10 @@ private: /// The set of all distinct functions. Use the insert() and remove() methods /// to modify it. - FnSetType FnSet; + FnTreeType FnTree; /// DataLayout for more accurate GEP comparisons. May be NULL. - DataLayout *TD; + const DataLayout *DL; /// Whether or not the target supports global aliases.
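The switch from a hash set of ComparableFunctions to FnTreeType, a std::set<FunctionPtr>, is what motivates the total order built above: std::set requires its key comparison to be a strict weak ordering, which is exactly what the doSanityCheck routine defined just below spot-checks (symmetry and transitivity of the comparator over a prefix of the worklist). A miniature of how such a tree deduplicates on insert (hypothetical Key type, sketch only):

#include <cstdio>
#include <set>

struct Key { int Rank; };
struct KeyLess {
  // Must be a strict weak ordering for std::set to behave correctly.
  bool operator()(const Key &L, const Key &R) const { return L.Rank < R.Rank; }
};

int main() {
  std::set<Key, KeyLess> Tree;
  Tree.insert({1});
  auto Result = Tree.insert({1});       // equivalent under KeyLess
  std::printf("%s\n", Result.second ? "inserted" : "duplicate"); // duplicate
}

MergeFunctions::insert below leans on the same return value: when Result.second is false, an equivalent function already sits in the tree and becomes the merge target.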
bool HasGlobalAliases; @@ -621,20 +1139,94 @@ ModulePass *llvm::createMergeFunctionsPass() { return new MergeFunctions(); } +bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) { + if (const unsigned Max = NumFunctionsForSanityCheck) { + unsigned TripleNumber = 0; + bool Valid = true; + + dbgs() << "MERGEFUNC-SANITY: Started for first " << Max << " functions.\n"; + + unsigned i = 0; + for (std::vector<WeakVH>::iterator I = Worklist.begin(), E = Worklist.end(); + I != E && i < Max; ++I, ++i) { + unsigned j = i; + for (std::vector<WeakVH>::iterator J = I; J != E && j < Max; ++J, ++j) { + Function *F1 = cast<Function>(*I); + Function *F2 = cast<Function>(*J); + int Res1 = FunctionComparator(DL, F1, F2).compare(); + int Res2 = FunctionComparator(DL, F2, F1).compare(); + + // If F1 <= F2, then F2 >= F1, otherwise report failure. + if (Res1 != -Res2) { + dbgs() << "MERGEFUNC-SANITY: Non-symmetric; triple: " << TripleNumber + << "\n"; + F1->dump(); + F2->dump(); + Valid = false; + } + + if (Res1 == 0) + continue; + + unsigned k = j; + for (std::vector<WeakVH>::iterator K = J; K != E && k < Max; + ++k, ++K, ++TripleNumber) { + if (K == J) + continue; + + Function *F3 = cast<Function>(*K); + int Res3 = FunctionComparator(DL, F1, F3).compare(); + int Res4 = FunctionComparator(DL, F2, F3).compare(); + + bool Transitive = true; + + if (Res1 != 0 && Res1 == Res4) { + // F1 > F2, F2 > F3 => F1 > F3 + Transitive = Res3 == Res1; + } else if (Res3 != 0 && Res3 == -Res4) { + // F1 > F3, F3 > F2 => F1 > F2 + Transitive = Res3 == Res1; + } else if (Res4 != 0 && -Res3 == Res4) { + // F2 > F3, F3 > F1 => F2 > F1 + Transitive = Res4 == -Res1; + } + + if (!Transitive) { + dbgs() << "MERGEFUNC-SANITY: Non-transitive; triple: " + << TripleNumber << "\n"; + dbgs() << "Res1, Res3, Res4: " << Res1 << ", " << Res3 << ", " + << Res4 << "\n"; + F1->dump(); + F2->dump(); + F3->dump(); + Valid = false; + } + } + } + } + + dbgs() << "MERGEFUNC-SANITY: " << (Valid ? "Passed." : "Failed.") << "\n"; + return Valid; + } + return true; +} + bool MergeFunctions::runOnModule(Module &M) { bool Changed = false; - TD = getAnalysisIfAvailable<DataLayout>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? 
&DLP->getDataLayout() : nullptr; for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) Deferred.push_back(WeakVH(I)); } - FnSet.resize(Deferred.size()); do { std::vector<WeakVH> Worklist; Deferred.swap(Worklist); + DEBUG(doSanityCheck(Worklist)); + DEBUG(dbgs() << "size of module: " << M.size() << '\n'); DEBUG(dbgs() << "size of worklist: " << Worklist.size() << '\n'); @@ -646,8 +1238,7 @@ bool MergeFunctions::runOnModule(Module &M) { Function *F = cast<Function>(*I); if (!F->isDeclaration() && !F->hasAvailableExternallyLinkage() && !F->mayBeOverridden()) { - ComparableFunction CF = ComparableFunction(F, TD); - Changed |= insert(CF); + Changed |= insert(F); } } @@ -661,49 +1252,27 @@ bool MergeFunctions::runOnModule(Module &M) { Function *F = cast<Function>(*I); if (!F->isDeclaration() && !F->hasAvailableExternallyLinkage() && F->mayBeOverridden()) { - ComparableFunction CF = ComparableFunction(F, TD); - Changed |= insert(CF); + Changed |= insert(F); } } - DEBUG(dbgs() << "size of FnSet: " << FnSet.size() << '\n'); + DEBUG(dbgs() << "size of FnTree: " << FnTree.size() << '\n'); } while (!Deferred.empty()); - FnSet.clear(); + FnTree.clear(); return Changed; } -bool DenseMapInfo<ComparableFunction>::isEqual(const ComparableFunction &LHS, - const ComparableFunction &RHS) { - if (LHS.getFunc() == RHS.getFunc() && - LHS.getHash() == RHS.getHash()) - return true; - if (!LHS.getFunc() || !RHS.getFunc()) - return false; - - // One of these is a special "underlying pointer comparison only" object. - if (LHS.getTD() == ComparableFunction::LookupOnly || - RHS.getTD() == ComparableFunction::LookupOnly) - return false; - - assert(LHS.getTD() == RHS.getTD() && - "Comparing functions for different targets"); - - return FunctionComparator(LHS.getTD(), LHS.getFunc(), - RHS.getFunc()).compare(); -} - // Replace direct callers of Old with New. void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) { Constant *BitcastNew = ConstantExpr::getBitCast(New, Old->getType()); - for (Value::use_iterator UI = Old->use_begin(), UE = Old->use_end(); - UI != UE;) { - Value::use_iterator TheIter = UI; + for (auto UI = Old->use_begin(), UE = Old->use_end(); UI != UE;) { + Use *U = &*UI; ++UI; - CallSite CS(*TheIter); - if (CS && CS.isCallee(TheIter)) { + CallSite CS(U->getUser()); + if (CS && CS.isCallee(U)) { remove(CS.getInstruction()->getParent()->getParent()); - TheIter.getUse().set(BitcastNew); + U->set(BitcastNew); } } } @@ -723,9 +1292,24 @@ void MergeFunctions::writeThunkOrAlias(Function *F, Function *G) { // Helper for writeThunk, // Selects proper bitcast operation, -// but a bit simplier then CastInst::getCastOpcode. +// but a bit simpler than CastInst::getCastOpcode.
+static Value *createCast(IRBuilder<false> &Builder, Value *V, Type *DestTy) { Type *SrcTy = V->getType(); + if (SrcTy->isStructTy()) { + assert(DestTy->isStructTy()); + assert(SrcTy->getStructNumElements() == DestTy->getStructNumElements()); + Value *Result = UndefValue::get(DestTy); + for (unsigned int I = 0, E = SrcTy->getStructNumElements(); I < E; ++I) { + Value *Element = createCast( + Builder, Builder.CreateExtractValue(V, ArrayRef<unsigned int>(I)), + DestTy->getStructElementType(I)); + + Result = + Builder.CreateInsertValue(Result, Element, ArrayRef<unsigned int>(I)); + } + return Result; + } + assert(!DestTy->isStructTy()); if (SrcTy->isIntegerTy() && DestTy->isPointerTy()) return Builder.CreateIntToPtr(V, DestTy); else if (SrcTy->isPointerTy() && DestTy->isIntegerTy()) @@ -784,9 +1368,9 @@ void MergeFunctions::writeThunk(Function *F, Function *G) { // Replace G with an alias to F and delete G. void MergeFunctions::writeAlias(Function *F, Function *G) { - Constant *BitcastF = ConstantExpr::getBitCast(F, G->getType()); - GlobalAlias *GA = new GlobalAlias(G->getType(), G->getLinkage(), "", - BitcastF, G->getParent()); + PointerType *PTy = G->getType(); + auto *GA = GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(), + G->getLinkage(), "", F); F->setAlignment(std::max(F->getAlignment(), G->getAlignment())); GA->takeName(G); GA->setVisibility(G->getVisibility()); @@ -833,54 +1417,57 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) { ++NumFunctionsMerged; } -// Insert a ComparableFunction into the FnSet, or merge it away if equal to one +// Insert a ComparableFunction into the FnTree, or merge it away if equal to one // that was already inserted. -bool MergeFunctions::insert(ComparableFunction &NewF) { - std::pair<FnSetType::iterator, bool> Result = FnSet.insert(NewF); +bool MergeFunctions::insert(Function *NewFunction) { + std::pair<FnTreeType::iterator, bool> Result = + FnTree.insert(FunctionPtr(NewFunction, DL)); + if (Result.second) { - DEBUG(dbgs() << "Inserting as unique: " << NewF.getFunc()->getName() << '\n'); + DEBUG(dbgs() << "Inserting as unique: " << NewFunction->getName() << '\n'); return false; } - const ComparableFunction &OldF = *Result.first; + const FunctionPtr &OldF = *Result.first; // Don't merge tiny functions, since it can just end up making the function // larger. // FIXME: Should still merge them if they are unnamed_addr and produce an // alias. - if (NewF.getFunc()->size() == 1) { - if (NewF.getFunc()->front().size() <= 2) { - DEBUG(dbgs() << NewF.getFunc()->getName() - << " is to small to bother merging\n"); + if (NewFunction->size() == 1) { + if (NewFunction->front().size() <= 2) { + DEBUG(dbgs() << NewFunction->getName() + << " is too small to bother merging\n"); return false; } } // Never thunk a strong function to a weak function. - assert(!OldF.getFunc()->mayBeOverridden() || - NewF.getFunc()->mayBeOverridden()); + assert(!OldF.getFunc()->mayBeOverridden() || NewFunction->mayBeOverridden()); - DEBUG(dbgs() << " " << OldF.getFunc()->getName() << " == " - << NewF.getFunc()->getName() << '\n'); + DEBUG(dbgs() << " " << OldF.getFunc()->getName() + << " == " << NewFunction->getName() << '\n'); - Function *DeleteF = NewF.getFunc(); - NewF.release(); + Function *DeleteF = NewFunction; mergeTwoFunctions(OldF.getFunc(), DeleteF); return true; } -// Remove a function from FnSet. If it was already in FnSet, add it to Deferred -// so that we'll look at it in the next round. +// Remove a function from FnTree.
If it was already in FnTree, add +// it to Deferred so that we'll look at it in the next round. void MergeFunctions::remove(Function *F) { // We need to make sure we remove F, not a function "equal" to F per the // function equality comparator. - // - // The special "lookup only" ComparableFunction bypasses the expensive - // function comparison in favour of a pointer comparison on the underlying - // Function*'s. - ComparableFunction CF = ComparableFunction(F, ComparableFunction::LookupOnly); - if (FnSet.erase(CF)) { - DEBUG(dbgs() << "Removed " << F->getName() << " from set and deferred it.\n"); + FnTreeType::iterator found = FnTree.find(FunctionPtr(F, DL)); + size_t Erased = 0; + if (found != FnTree.end() && found->getFunc() == F) { + Erased = 1; + FnTree.erase(found); + } + + if (Erased) { + DEBUG(dbgs() << "Removed " << F->getName() + << " from set and deferred it.\n"); Deferred.push_back(F); } } @@ -894,17 +1481,14 @@ void MergeFunctions::removeUsers(Value *V) { Value *V = Worklist.back(); Worklist.pop_back(); - for (Value::use_iterator UI = V->use_begin(), UE = V->use_end(); - UI != UE; ++UI) { - Use &U = UI.getUse(); - if (Instruction *I = dyn_cast<Instruction>(U.getUser())) { + for (User *U : V->users()) { + if (Instruction *I = dyn_cast<Instruction>(U)) { remove(I->getParent()->getParent()); - } else if (isa<GlobalValue>(U.getUser())) { + } else if (isa<GlobalValue>(U)) { // do nothing - } else if (Constant *C = dyn_cast<Constant>(U.getUser())) { - for (Value::use_iterator CUI = C->use_begin(), CUE = C->use_end(); - CUI != CUE; ++CUI) - Worklist.push_back(*CUI); + } else if (Constant *C = dyn_cast<Constant>(U)) { + for (User *UU : C->users()) + Worklist.push_back(UU); } } } diff --git a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp index fa518cb..76d6dfa 100644 --- a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -12,30 +12,31 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "partialinlining" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Dominators.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" -#include "llvm/Support/CFG.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/CodeExtractor.h" using namespace llvm; +#define DEBUG_TYPE "partialinlining" + STATISTIC(NumPartialInlined, "Number of functions partially inlined"); namespace { struct PartialInliner : public ModulePass { - virtual void getAnalysisUsage(AnalysisUsage &AU) const { } + void getAnalysisUsage(AnalysisUsage &AU) const override { } static char ID; // Pass identification, replacement for typeid PartialInliner() : ModulePass(ID) { initializePartialInlinerPass(*PassRegistry::getPassRegistry()); } - - bool runOnModule(Module& M); - + + bool runOnModule(Module& M) override; + private: Function* unswitchFunction(Function* F); }; @@ -52,10 +53,10 @@ Function* PartialInliner::unswitchFunction(Function* F) { BasicBlock* entryBlock = F->begin(); BranchInst *BR = dyn_cast<BranchInst>(entryBlock->getTerminator()); if (!BR || BR->isUnconditional()) - return 0; + return nullptr; - BasicBlock* returnBlock = 0; - BasicBlock* nonReturnBlock = 0; + BasicBlock* returnBlock = nullptr; + BasicBlock* nonReturnBlock = nullptr; unsigned returnCount = 0; for (succ_iterator SI = 
succ_begin(entryBlock), SE = succ_end(entryBlock); SI != SE; ++SI) @@ -66,7 +67,7 @@ Function* PartialInliner::unswitchFunction(Function* F) { nonReturnBlock = *SI; if (returnCount != 1) - return 0; + return nullptr; // Clone the function, so that we can hack away on it. ValueToValueMapTy VMap; @@ -119,8 +120,8 @@ Function* PartialInliner::unswitchFunction(Function* F) { // The CodeExtractor needs a dominator tree. DominatorTree DT; - DT.runOnFunction(*duplicateFunction); - + DT.recalculate(*duplicateFunction); + // Extract the body of the if. Function* extractedFunction = CodeExtractor(toExtract, &DT).extractCodeRegion(); @@ -128,8 +129,8 @@ Function* PartialInliner::unswitchFunction(Function* F) { InlineFunctionInfo IFI; // Inline the top-level if test into all callers. - std::vector<User*> Users(duplicateFunction->use_begin(), - duplicateFunction->use_end()); + std::vector<User *> Users(duplicateFunction->user_begin(), + duplicateFunction->user_end()); for (std::vector<User*>::iterator UI = Users.begin(), UE = Users.end(); UI != UE; ++UI) if (CallInst *CI = dyn_cast<CallInst>(*UI)) @@ -162,9 +163,8 @@ bool PartialInliner::runOnModule(Module& M) { if (currFunc->use_empty()) continue; bool recursive = false; - for (Function::use_iterator UI = currFunc->use_begin(), - UE = currFunc->use_end(); UI != UE; ++UI) - if (Instruction* I = dyn_cast<Instruction>(*UI)) + for (User *U : currFunc->users()) + if (Instruction* I = dyn_cast<Instruction>(U)) if (I->getParent()->getParent() == currFunc) { recursive = true; break; diff --git a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 24c5018..6d9d8be 100644 --- a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -17,7 +17,7 @@ #include "llvm-c/Transforms/PassManagerBuilder.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/Passes.h" -#include "llvm/Analysis/Verifier.h" +#include "llvm/IR/Verifier.h" #include "llvm/PassManager.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ManagedStatic.h" @@ -33,11 +33,6 @@ RunLoopVectorization("vectorize-loops", cl::Hidden, cl::desc("Run the Loop vectorization passes")); static cl::opt<bool> -LateVectorization("late-vectorize", cl::init(true), cl::Hidden, - cl::desc("Run the vectorization pasess late in the pass " - "pipeline (after the inliner)")); - -static cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::Hidden, cl::desc("Run the SLP vectorization passes")); @@ -58,18 +53,27 @@ static cl::opt<bool> RunLoopRerolling("reroll-loops", cl::Hidden, cl::desc("Run the loop rerolling pass")); +static cl::opt<bool> RunLoadCombine("combine-loads", cl::init(false), + cl::Hidden, + cl::desc("Run the load combining pass")); + +static cl::opt<bool> RunGVN("enable-gvn", cl::init(true), + cl::Hidden, + cl::desc("Run the global value numbering pass")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; - LibraryInfo = 0; - Inliner = 0; + LibraryInfo = nullptr; + Inliner = nullptr; + DisableTailCalls = false; DisableUnitAtATime = false; DisableUnrollLoops = false; BBVectorize = RunBBVectorization; SLPVectorize = RunSLPVectorization; LoopVectorize = RunLoopVectorization; - LateVectorize = LateVectorization; RerollLoops = RunLoopRerolling; + LoadCombine = RunLoadCombine; } PassManagerBuilder::~PassManagerBuilder() { @@ -134,7 +138,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { if (OptLevel == 0) { if (Inliner) 
{ MPM.add(Inliner); - Inliner = 0; + Inliner = nullptr; } // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC @@ -156,12 +160,13 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { if (!DisableUnitAtATime) { addExtensionsToPM(EP_ModuleOptimizerEarly, MPM); + MPM.add(createIPSCCPPass()); // IP SCCP MPM.add(createGlobalOptimizerPass()); // Optimize out global vars - MPM.add(createIPSCCPPass()); // IP SCCP MPM.add(createDeadArgEliminationPass()); // Dead argument elimination MPM.add(createInstructionCombiningPass());// Clean up after IPCP & DAE + addExtensionsToPM(EP_Peephole, MPM); MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE } @@ -170,7 +175,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createPruneEHPass()); // Remove dead EH info if (Inliner) { MPM.add(Inliner); - Inliner = 0; + Inliner = nullptr; } if (!DisableUnitAtATime) MPM.add(createFunctionAttrsPass()); // Set readonly/readnone attrs @@ -188,11 +193,14 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createInstructionCombiningPass()); // Combine silly seq's + addExtensionsToPM(EP_Peephole, MPM); - MPM.add(createTailCallEliminationPass()); // Eliminate tail calls + if (!DisableTailCalls) + MPM.add(createTailCallEliminationPass()); // Eliminate tail calls MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createReassociatePass()); // Reassociate expressions - MPM.add(createLoopRotatePass()); // Rotate Loop + // Rotate Loop - disable header duplication at -Oz + MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); MPM.add(createLICMPass()); // Hoist loop invariants MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3)); MPM.add(createInstructionCombiningPass()); @@ -200,21 +208,22 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. MPM.add(createLoopDeletionPass()); // Delete dead loops - if (!LateVectorize && LoopVectorize) - MPM.add(createLoopVectorizePass(DisableUnrollLoops)); - if (!DisableUnrollLoops) - MPM.add(createLoopUnrollPass()); // Unroll small loops + MPM.add(createSimpleLoopUnrollPass()); // Unroll small loops addExtensionsToPM(EP_LoopOptimizerEnd, MPM); - if (OptLevel > 1) - MPM.add(createGVNPass()); // Remove redundancies + if (OptLevel > 1) { + MPM.add(createMergedLoadStoreMotionPass()); // Merge load/stores in diamond + if (RunGVN) + MPM.add(createGVNPass()); // Remove redundancies + } MPM.add(createMemCpyOptPass()); // Remove memcpy / form memset MPM.add(createSCCPPass()); // Constant prop with SCCP // Run instcombine after redundancy elimination to exploit opportunities // opened up by them. 
MPM.add(createInstructionCombiningPass()); + addExtensionsToPM(EP_Peephole, MPM); MPM.add(createJumpThreadingPass()); // Thread jumps MPM.add(createCorrelatedValuePropagationPass()); MPM.add(createDeadStoreEliminationPass()); // Delete dead stores @@ -229,6 +238,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { if (BBVectorize) { MPM.add(createBBVectorizePass()); MPM.add(createInstructionCombiningPass()); + addExtensionsToPM(EP_Peephole, MPM); if (OptLevel > 1 && UseGVNAfterVectorization) MPM.add(createGVNPass()); // Remove redundancies else @@ -239,25 +249,30 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createLoopUnrollPass()); } + if (LoadCombine) + MPM.add(createLoadCombinePass()); + MPM.add(createAggressiveDCEPass()); // Delete dead instructions MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createInstructionCombiningPass()); // Clean up after everything. + addExtensionsToPM(EP_Peephole, MPM); + + // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC + // pass manager that we are specifically trying to avoid. To prevent this + // we must insert a no-op module pass to reset the pass manager. + MPM.add(createBarrierNoopPass()); + MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize)); + // FIXME: Because of #pragma vectorize enable, the passes below are always + // inserted in the pipeline, even when the vectorizer doesn't run (ex. when + // on -O1 and no #pragma is found). Would be good to have these two passes + // as function calls, so that we can only pass them when the vectorizer + // changed the code. + MPM.add(createInstructionCombiningPass()); + addExtensionsToPM(EP_Peephole, MPM); + MPM.add(createCFGSimplificationPass()); - // As an experimental mode, run any vectorization passes in a separate - // pipeline from the CGSCC pass manager that runs iteratively with the - // inliner. - if (LateVectorize && LoopVectorize) { - // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC - // pass manager that we are specifically trying to avoid. To prevent this - // we must insert a no-op module pass to reset the pass manager. - MPM.add(createBarrierNoopPass()); - - // Add the various vectorization passes and relevant cleanup passes for - // them since we are no longer in the middle of the main scalar pipeline. - MPM.add(createLoopVectorizePass(DisableUnrollLoops)); - MPM.add(createInstructionCombiningPass()); - MPM.add(createCFGSimplificationPass()); - } + if (!DisableUnrollLoops) + MPM.add(createLoopUnrollPass()); // Unroll small loops if (!DisableUnitAtATime) { // FIXME: We shouldn't bother with this anymore. @@ -306,6 +321,7 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, // function pointers. When this happens, we often have to resolve varargs // calls, etc, so let instcombine do this. PM.add(createInstructionCombiningPass()); + addExtensionsToPM(EP_Peephole, PM); // Inline small functions if (RunInliner) @@ -324,6 +340,7 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, // The IPO passes may leave cruft around. Clean up after them. PM.add(createInstructionCombiningPass()); + addExtensionsToPM(EP_Peephole, PM); PM.add(createJumpThreadingPass()); // Break up allocas @@ -337,14 +354,27 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, PM.add(createGlobalsModRefPass()); // IP alias analysis. PM.add(createLICMPass()); // Hoist loop invariants. 
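For orientation, these PassManagerBuilder hunks rearrange the pipeline a front end receives; notably, the loop vectorizer now always runs after the inliner, behind a BarrierNoopPass that keeps it out of the CGSCC pass manager. A minimal sketch, assuming this era's legacy pass-manager headers, of how the populated pipeline is consumed (hypothetical buildO2Pipeline helper):

#include "llvm/PassManager.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
using namespace llvm;

// Configure the builder roughly as -O2 would, then let it populate a
// module-level pass manager with the sequence assembled above.
void buildO2Pipeline(PassManager &MPM) {
  PassManagerBuilder PMB;
  PMB.OptLevel = 2;
  PMB.SizeLevel = 0;
  PMB.LoopVectorize = true; // mirrors the -vectorize-loops flag above
  PMB.populateModulePassManager(MPM);
}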
+ PM.add(createMergedLoadStoreMotionPass()); // Merge load/stores in diamonds PM.add(createGVNPass(DisableGVNLoadPRE)); // Remove redundancies. PM.add(createMemCpyOptPass()); // Remove dead memcpys. // Nuke dead stores. PM.add(createDeadStoreEliminationPass()); + // More loops are countable; try to optimize them. + PM.add(createIndVarSimplifyPass()); + PM.add(createLoopDeletionPass()); + PM.add(createLoopVectorizePass(true, true)); + + // More scalar chains could be vectorized due to more alias information + PM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. + + if (LoadCombine) + PM.add(createLoadCombinePass()); + // Cleanup and simplify the code after the scalar optimizations. PM.add(createInstructionCombiningPass()); + addExtensionsToPM(EP_Peephole, PM); PM.add(createJumpThreadingPass()); diff --git a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp index b160913..b2c4a09 100644 --- a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp @@ -14,22 +14,23 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "prune-eh" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/Support/CFG.h" #include <algorithm> using namespace llvm; +#define DEBUG_TYPE "prune-eh" + STATISTIC(NumRemoved, "Number of invokes removed"); STATISTIC(NumUnreach, "Number of noreturn calls optimized"); @@ -41,7 +42,7 @@ namespace { } // runOnSCC - Analyze the SCC, performing the transformation if possible. - bool runOnSCC(CallGraphSCC &SCC); + bool runOnSCC(CallGraphSCC &SCC) override; bool SimplifyFunction(Function *F); void DeleteBasicBlock(BasicBlock *BB); @@ -51,7 +52,7 @@ namespace { char PruneEH::ID = 0; INITIALIZE_PASS_BEGIN(PruneEH, "prune-eh", "Remove unused exception handling info", false, false) -INITIALIZE_PASS_DEPENDENCY(CallGraph) +INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) INITIALIZE_PASS_END(PruneEH, "prune-eh", "Remove unused exception handling info", false, false) @@ -60,7 +61,7 @@ Pass *llvm::createPruneEHPass() { return new PruneEH(); } bool PruneEH::runOnSCC(CallGraphSCC &SCC) { SmallPtrSet<CallGraphNode *, 8> SCCNodes; - CallGraph &CG = getAnalysis<CallGraph>(); + CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); bool MadeChange = false; // Fill SCCNodes with the elements of the SCC. Used for quickly @@ -85,7 +86,7 @@ bool PruneEH::runOnSCC(CallGraphSCC &SCC) { for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); (!SCCMightUnwind || !SCCMightReturn) && I != E; ++I) { Function *F = (*I)->getFunction(); - if (F == 0) { + if (!F) { SCCMightUnwind = true; SCCMightReturn = true; } else if (F->isDeclaration() || F->mayBeOverridden()) { @@ -234,7 +235,7 @@ bool PruneEH::SimplifyFunction(Function *F) { /// exist in the BB. 
void PruneEH::DeleteBasicBlock(BasicBlock *BB) { assert(pred_begin(BB) == pred_end(BB) && "BB is not dead!"); - CallGraph &CG = getAnalysis<CallGraph>(); + CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); CallGraphNode *CGN = CG[BB->getParent()]; for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; ) { diff --git a/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp b/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp index f00830a..956991a 100644 --- a/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp +++ b/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp @@ -14,13 +14,14 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "strip-dead-prototypes" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" using namespace llvm; +#define DEBUG_TYPE "strip-dead-prototypes" + STATISTIC(NumDeadPrototypes, "Number of dead prototypes removed"); namespace { @@ -32,7 +33,7 @@ public: StripDeadPrototypesPass() : ModulePass(ID) { initializeStripDeadPrototypesPassPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnModule(Module &M); + bool runOnModule(Module &M) override; }; } // end anonymous namespace diff --git a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp index c4f5cfc..1abbccc 100644 --- a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp +++ b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp @@ -23,8 +23,8 @@ #include "llvm/Transforms/IPO.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/DebugInfo.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -44,9 +44,9 @@ namespace { initializeStripSymbolsPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnModule(Module &M); + bool runOnModule(Module &M) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); } }; @@ -59,9 +59,9 @@ namespace { initializeStripNonDebugSymbolsPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnModule(Module &M); + bool runOnModule(Module &M) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); } }; @@ -74,9 +74,9 @@ namespace { initializeStripDebugDeclarePass(*PassRegistry::getPassRegistry()); } - virtual bool runOnModule(Module &M); + bool runOnModule(Module &M) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); } }; @@ -89,9 +89,9 @@ namespace { initializeStripDeadDebugInfoPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnModule(Module &M); + bool runOnModule(Module &M) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); } }; @@ -132,11 +132,10 @@ ModulePass *llvm::createStripDeadDebugInfoPass() { /// OnlyUsedBy - Return true if V is only used by Usr. 
static bool OnlyUsedBy(Value *V, Value *Usr) { - for(Value::use_iterator I = V->use_begin(), E = V->use_end(); I != E; ++I) { - User *U = *I; + for (User *U : V->users()) if (U != Usr) return false; - } + return true; } @@ -147,7 +146,7 @@ static void RemoveDeadConstant(Constant *C) { if (OnlyUsedBy(C->getOperand(i), C)) Operands.insert(cast<Constant>(C->getOperand(i))); if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) { - if (!GV->hasLocalLinkage()) return; // Don't delete non static globals. + if (!GV->hasLocalLinkage()) return; // Don't delete non-static globals. GV->eraseFromParent(); } else if (!isa<Function>(C)) @@ -193,7 +192,7 @@ static void StripTypeNames(Module &M, bool PreserveDbgInfo) { /// Find values that are marked as llvm.used. static void findUsedValues(GlobalVariable *LLVMUsed, SmallPtrSet<const GlobalValue*, 8> &UsedValues) { - if (LLVMUsed == 0) return; + if (!LLVMUsed) return; UsedValues.insert(LLVMUsed); ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer()); @@ -250,7 +249,7 @@ bool StripDebugDeclare::runOnModule(Module &M) { if (Declare) { while (!Declare->use_empty()) { - CallInst *CI = cast<CallInst>(Declare->use_back()); + CallInst *CI = cast<CallInst>(Declare->user_back()); Value *Arg1 = CI->getArgOperand(0); Value *Arg2 = CI->getArgOperand(1); assert(CI->use_empty() && "llvm.dbg intrinsic should have void result"); @@ -307,10 +306,7 @@ bool StripDeadDebugInfo::runOnModule(Module &M) { SmallVector<Value *, 64> LiveSubprograms; DenseSet<const MDNode *> VisitedSet; - for (DebugInfoFinder::iterator CI = F.compile_unit_begin(), - CE = F.compile_unit_end(); CI != CE; ++CI) { - // Create our compile unit. - DICompileUnit DIC(*CI); + for (DICompileUnit DIC : F.compile_units()) { assert(DIC.Verify() && "DIC must verify as a DICompileUnit."); // Create our live subprogram list. diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombine.h b/contrib/llvm/lib/Transforms/InstCombine/InstCombine.h index a5eddc2..ab4dc1c 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombine.h +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombine.h @@ -11,46 +11,59 @@ #define INSTCOMBINE_INSTCOMBINE_H #include "InstCombineWorklist.h" +#include "llvm/Analysis/TargetFolder.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" -#include "llvm/InstVisitor.h" #include "llvm/Pass.h" -#include "llvm/Support/TargetFolder.h" #include "llvm/Transforms/Utils/SimplifyLibCalls.h" +#define DEBUG_TYPE "instcombine" + namespace llvm { - class CallSite; - class DataLayout; - class TargetLibraryInfo; - class DbgDeclareInst; - class MemIntrinsic; - class MemSetInst; +class CallSite; +class DataLayout; +class TargetLibraryInfo; +class DbgDeclareInst; +class MemIntrinsic; +class MemSetInst; /// SelectPatternFlavor - We can match a variety of different patterns for /// select operations. enum SelectPatternFlavor { SPF_UNKNOWN = 0, - SPF_SMIN, SPF_UMIN, - SPF_SMAX, SPF_UMAX - //SPF_ABS - TODO. + SPF_SMIN, + SPF_UMIN, + SPF_SMAX, + SPF_UMAX, + SPF_ABS, + SPF_NABS }; /// getComplexity: Assign a complexity or rank value to LLVM Values... 
/// 0 -> undef, 1 -> Const, 2 -> Other, 3 -> Arg, 3 -> Unary, 4 -> OtherInst static inline unsigned getComplexity(Value *V) { if (isa<Instruction>(V)) { - if (BinaryOperator::isNeg(V) || - BinaryOperator::isFNeg(V) || + if (BinaryOperator::isNeg(V) || BinaryOperator::isFNeg(V) || BinaryOperator::isNot(V)) return 3; return 4; } - if (isa<Argument>(V)) return 3; + if (isa<Argument>(V)) + return 3; return isa<Constant>(V) ? (isa<UndefValue>(V) ? 0 : 1) : 2; } +/// AddOne - Add one to a Constant +static inline Constant *AddOne(Constant *C) { + return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1)); +} +/// SubOne - Subtract one from a Constant +static inline Constant *SubOne(Constant *C) { + return ConstantExpr::getSub(C, ConstantInt::get(C->getType(), 1)); +} /// InstCombineIRInserter - This is an IRBuilder insertion helper that works /// just like the normal insertion helper, but also adds any new instructions @@ -58,11 +71,12 @@ static inline unsigned getComplexity(Value *V) { class LLVM_LIBRARY_VISIBILITY InstCombineIRInserter : public IRBuilderDefaultInserter<true> { InstCombineWorklist &Worklist; + public: InstCombineIRInserter(InstCombineWorklist &WL) : Worklist(WL) {} - void InsertHelper(Instruction *I, const Twine &Name, - BasicBlock *BB, BasicBlock::iterator InsertPt) const { + void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB, + BasicBlock::iterator InsertPt) const { IRBuilderDefaultInserter<true>::InsertHelper(I, Name, BB, InsertPt); Worklist.Add(I); } @@ -70,13 +84,14 @@ public: /// InstCombiner - The -instcombine pass. class LLVM_LIBRARY_VISIBILITY InstCombiner - : public FunctionPass, - public InstVisitor<InstCombiner, Instruction*> { - DataLayout *TD; + : public FunctionPass, + public InstVisitor<InstCombiner, Instruction *> { + const DataLayout *DL; TargetLibraryInfo *TLI; bool MadeIRChange; LibCallSimplifier *Simplifier; bool MinimizeSize; + public: /// Worklist - All of the instructions that need to be simplified. 
InstCombineWorklist Worklist; @@ -87,19 +102,19 @@ public: BuilderTy *Builder; static char ID; // Pass identification, replacement for typeid - InstCombiner() : FunctionPass(ID), TD(0), Builder(0) { + InstCombiner() : FunctionPass(ID), DL(nullptr), Builder(nullptr) { MinimizeSize = false; initializeInstCombinerPass(*PassRegistry::getPassRegistry()); } public: - virtual bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; bool DoOneIteration(Function &F, unsigned ItNum); - virtual void getAnalysisUsage(AnalysisUsage &AU) const; + void getAnalysisUsage(AnalysisUsage &AU) const override; - DataLayout *getDataLayout() const { return TD; } + const DataLayout *getDataLayout() const { return DL; } TargetLibraryInfo *getTargetLibraryInfo() const { return TLI; } @@ -116,7 +131,7 @@ public: Instruction *visitSub(BinaryOperator &I); Instruction *visitFSub(BinaryOperator &I); Instruction *visitMul(BinaryOperator &I); - Value *foldFMulConst(Instruction *FMulOrDiv, ConstantFP *C, + Value *foldFMulConst(Instruction *FMulOrDiv, Constant *C, Instruction *InsertBefore); Instruction *visitFMul(BinaryOperator &I); Instruction *visitURem(BinaryOperator &I); @@ -135,9 +150,9 @@ public: Instruction *visitAnd(BinaryOperator &I); Value *FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS); Value *FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS); - Instruction *FoldOrWithConstants(BinaryOperator &I, Value *Op, - Value *A, Value *B, Value *C); - Instruction *visitOr (BinaryOperator &I); + Instruction *FoldOrWithConstants(BinaryOperator &I, Value *Op, Value *A, + Value *B, Value *C); + Instruction *visitOr(BinaryOperator &I); Instruction *visitXor(BinaryOperator &I); Instruction *visitShl(BinaryOperator &I); Instruction *visitAShr(BinaryOperator &I); @@ -147,12 +162,11 @@ public: Constant *RHSC); Instruction *FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, CmpInst &ICI, - ConstantInt *AndCst = 0); + ConstantInt *AndCst = nullptr); Instruction *visitFCmpInst(FCmpInst &I); Instruction *visitICmpInst(ICmpInst &I); Instruction *visitICmpInstWithCastAndCast(ICmpInst &ICI); - Instruction *visitICmpInstWithInstAndIntCst(ICmpInst &ICI, - Instruction *LHS, + Instruction *visitICmpInstWithInstAndIntCst(ICmpInst &ICI, Instruction *LHS, ConstantInt *RHS); Instruction *FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, ConstantInt *DivRHS); @@ -162,7 +176,7 @@ public: ICmpInst::Predicate Pred); Instruction *FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, ICmpInst::Predicate Cond, Instruction &I); - Instruction *FoldShiftByConstant(Value *Op0, ConstantInt *Op1, + Instruction *FoldShiftByConstant(Value *Op0, Constant *Op1, BinaryOperator &I); Instruction *commonCastTransforms(CastInst &CI); Instruction *commonPointerCastTransforms(CastInst &CI); @@ -179,9 +193,8 @@ public: Instruction *visitIntToPtr(IntToPtrInst &CI); Instruction *visitBitCast(BitCastInst &CI); Instruction *visitAddrSpaceCast(AddrSpaceCastInst &CI); - Instruction *FoldSelectOpOp(SelectInst &SI, Instruction *TI, - Instruction *FI); - Instruction *FoldSelectIntoOp(SelectInst &SI, Value*, Value*); + Instruction *FoldSelectOpOp(SelectInst &SI, Instruction *TI, Instruction *FI); + Instruction *FoldSelectIntoOp(SelectInst &SI, Value *, Value *); Instruction *FoldSPFofSPF(Instruction *Inner, SelectPatternFlavor SPF1, Value *A, Value *B, Instruction &Outer, SelectPatternFlavor SPF2, Value *C); @@ -200,6 +213,7 @@ public: Instruction *visitStoreInst(StoreInst &SI); Instruction *visitBranchInst(BranchInst &BI); Instruction 
*visitSwitchInst(SwitchInst &SI); + Instruction *visitInsertValueInst(InsertValueInst &IV); Instruction *visitInsertElementInst(InsertElementInst &IE); Instruction *visitExtractElementInst(ExtractElementInst &EI); Instruction *visitShuffleVectorInst(ShuffleVectorInst &SVI); @@ -207,25 +221,25 @@ public: Instruction *visitLandingPadInst(LandingPadInst &LI); // visitInstruction - Specify what to return for unhandled instructions... - Instruction *visitInstruction(Instruction &I) { return 0; } + Instruction *visitInstruction(Instruction &I) { return nullptr; } private: bool ShouldChangeType(Type *From, Type *To) const; Value *dyn_castNegVal(Value *V) const; - Value *dyn_castFNegVal(Value *V, bool NoSignedZero=false) const; + Value *dyn_castFNegVal(Value *V, bool NoSignedZero = false) const; Type *FindElementAtOffset(Type *PtrTy, int64_t Offset, - SmallVectorImpl<Value*> &NewIndices); + SmallVectorImpl<Value *> &NewIndices); Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI); /// ShouldOptimizeCast - Return true if the cast from "V to Ty" actually /// results in any code being generated and is interesting to optimize out. If /// the cast can be eliminated by some other simple transformation, we prefer /// to do the simplification first. - bool ShouldOptimizeCast(Instruction::CastOps opcode,const Value *V, + bool ShouldOptimizeCast(Instruction::CastOps opcode, const Value *V, Type *Ty); Instruction *visitCallSite(CallSite CS); - Instruction *tryOptimizeCall(CallInst *CI, const DataLayout *TD); + Instruction *tryOptimizeCall(CallInst *CI, const DataLayout *DL); bool transformConstExprCastCall(CallSite CS); Instruction *transformCallThroughTrampoline(CallSite CS, IntrinsicInst *Tramp); @@ -233,6 +247,7 @@ private: bool DoXform = true); Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI); bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS); + bool WillNotOverflowUnsignedAdd(Value *LHS, Value *RHS); Value *EmitGEPOffset(User *GEP); Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN); Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask); @@ -242,10 +257,10 @@ public: // in the program. Add the new instruction to the worklist. // Instruction *InsertNewInstBefore(Instruction *New, Instruction &Old) { - assert(New && New->getParent() == 0 && + assert(New && !New->getParent() && "New instruction already inserted into a basic block!"); BasicBlock *BB = Old.getParent(); - BB->getInstList().insert(&Old, New); // Insert inst + BB->getInstList().insert(&Old, New); // Insert inst Worklist.Add(New); return New; } @@ -265,7 +280,7 @@ public: // modified. // Instruction *ReplaceInstUsesWith(Instruction &I, Value *V) { - Worklist.AddUsersToWorkList(I); // Add all modified instrs to worklist. + Worklist.AddUsersToWorkList(I); // Add all modified instrs to worklist. // If we are replacing the instruction with itself, this must be in a // segment of unreachable code, so just clobber the instruction. 
@@ -297,24 +312,23 @@ public: Worklist.Remove(&I); I.eraseFromParent(); MadeIRChange = true; - return 0; // Don't do anything with FI + return nullptr; // Don't do anything with FI } - void ComputeMaskedBits(Value *V, APInt &KnownZero, - APInt &KnownOne, unsigned Depth = 0) const { - return llvm::ComputeMaskedBits(V, KnownZero, KnownOne, TD, Depth); + void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne, + unsigned Depth = 0) const { + return llvm::computeKnownBits(V, KnownZero, KnownOne, DL, Depth); } bool MaskedValueIsZero(Value *V, const APInt &Mask, unsigned Depth = 0) const { - return llvm::MaskedValueIsZero(V, Mask, TD, Depth); + return llvm::MaskedValueIsZero(V, Mask, DL, Depth); } unsigned ComputeNumSignBits(Value *Op, unsigned Depth = 0) const { - return llvm::ComputeNumSignBits(Op, TD, Depth); + return llvm::ComputeNumSignBits(Op, DL, Depth); } private: - /// SimplifyAssociativeOrCommutative - This performs a few simplifications for /// operators which are associative or commutative. bool SimplifyAssociativeOrCommutative(BinaryOperator &I); @@ -328,12 +342,10 @@ private: /// SimplifyDemandedUseBits - Attempts to replace V with a simpler value /// based on the demanded bits. - Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, - APInt& KnownZero, APInt& KnownOne, - unsigned Depth); - bool SimplifyDemandedBits(Use &U, APInt DemandedMask, - APInt& KnownZero, APInt& KnownOne, - unsigned Depth=0); + Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, APInt &KnownZero, + APInt &KnownOne, unsigned Depth); + bool SimplifyDemandedBits(Use &U, APInt DemandedMask, APInt &KnownZero, + APInt &KnownOne, unsigned Depth = 0); /// Helper routine of SimplifyDemandedUseBits. It tries to simplify demanded /// bit for "r1 = shr x, c1; r2 = shl r1, c2" instruction sequence. Value *SimplifyShrShlDemandedBits(Instruction *Lsr, Instruction *Sftl, @@ -346,7 +358,9 @@ private: bool SimplifyDemandedInstructionBits(Instruction &Inst); Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, - APInt& UndefElts, unsigned Depth = 0); + APInt &UndefElts, unsigned Depth = 0); + + Value *SimplifyVectorOp(BinaryOperator &Inst); // FoldOpIntoPhi - Given a binary operator, cast instruction, or select // which has a PHI node as operand #0, see if we can fold the instruction @@ -363,21 +377,19 @@ private: Instruction *FoldPHIArgGEPIntoPHI(PHINode &PN); Instruction *FoldPHIArgLoadIntoPHI(PHINode &PN); - Instruction *OptAndOp(Instruction *Op, ConstantInt *OpRHS, ConstantInt *AndRHS, BinaryOperator &TheAnd); Value *FoldLogicalPlusAnd(Value *LHS, Value *RHS, ConstantInt *Mask, bool isSub, Instruction &I); - Value *InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, - bool isSigned, bool Inside); + Value *InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, bool isSigned, + bool Inside); Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI); Instruction *MatchBSwap(BinaryOperator &I); bool SimplifyStoreAtEndOfBlock(StoreInst &SI); Instruction *SimplifyMemTransfer(MemIntrinsic *MI); Instruction *SimplifyMemSet(MemSetInst *MI); - Value *EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned); /// Descale - Return a value X such that Val = X * Scale, or null if none. If @@ -385,8 +397,8 @@ private: Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap); }; - - } // end namespace llvm. 
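The rename from ComputeMaskedBits to computeKnownBits above is mechanical, but the analysis it exposes is worth a picture: it tracks, per bit, whether that bit is known zero or known one. A standalone arithmetic sketch (hypothetical KnownBits8 type; the real API works on APInt pairs as in the signatures above):

#include <cstdint>

// A set bit in Zero means "known to be 0"; a set bit in One, "known to be 1".
struct KnownBits8 { uint8_t Zero, One; };

// Transfer function for V = X | Mask with Mask a constant: every Mask bit
// becomes known one, and a bit stays known zero only where Mask is clear.
KnownBits8 knownBitsOfOrWithConst(KnownBits8 X, uint8_t Mask) {
  return {static_cast<uint8_t>(X.Zero & static_cast<uint8_t>(~Mask)),
          static_cast<uint8_t>(X.One | Mask)};
}

MaskedValueIsZero, wrapped above, is then a one-line query over this state: a mask of bits is known zero iff every one of those bits is in the known-zero set.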
+#undef DEBUG_TYPE + #endif diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 534feb8..e80d6a9 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -15,11 +15,13 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/DataLayout.h" -#include "llvm/Support/GetElementPtrTypeIterator.h" -#include "llvm/Support/PatternMatch.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/PatternMatch.h" using namespace llvm; using namespace PatternMatch; +#define DEBUG_TYPE "instcombine" + namespace { /// Class representing coefficient of floating-point addend. @@ -30,7 +32,7 @@ namespace { /// class FAddendCoef { public: - // The constructor has to initialize a APFloat, which is uncessary for + // The constructor has to initialize a APFloat, which is unnecessary for // most addends which have coefficient either 1 or -1. So, the constructor // is expensive. In order to avoid the cost of the constructor, we should // reuse some instances whenever possible. The pre-created instances @@ -112,12 +114,12 @@ namespace { /// class FAddend { public: - FAddend() { Val = 0; } + FAddend() { Val = nullptr; } Value *getSymVal (void) const { return Val; } const FAddendCoef &getCoef(void) const { return Coeff; } - bool isConstant() const { return Val == 0; } + bool isConstant() const { return Val == nullptr; } bool isZero() const { return Coeff.isZero(); } void set(short Coefficient, Value *V) { Coeff.set(Coefficient), Val = V; } @@ -154,7 +156,7 @@ namespace { /// class FAddCombine { public: - FAddCombine(InstCombiner::BuilderTy *B) : Builder(B), Instr(0) {} + FAddCombine(InstCombiner::BuilderTy *B) : Builder(B), Instr(nullptr) {} Value *simplify(Instruction *FAdd); private: @@ -175,7 +177,7 @@ namespace { Value *createFDiv(Value *Opnd0, Value *Opnd1); Value *createFNeg(Value *V); Value *createNaryFAdd(const AddendVect& Opnds, unsigned InstrQuota); - void createInstPostProc(Instruction *NewInst); + void createInstPostProc(Instruction *NewInst, bool NoNumber = false); InstCombiner::BuilderTy *Builder; Instruction *Instr; @@ -348,8 +350,8 @@ Value *FAddendCoef::getValue(Type *Ty) const { // unsigned FAddend::drillValueDownOneStep (Value *Val, FAddend &Addend0, FAddend &Addend1) { - Instruction *I = 0; - if (Val == 0 || !(I = dyn_cast<Instruction>(Val))) + Instruction *I = nullptr; + if (!Val || !(I = dyn_cast<Instruction>(Val))) return 0; unsigned Opcode = I->getOpcode(); @@ -359,16 +361,16 @@ unsigned FAddend::drillValueDownOneStep Value *Opnd0 = I->getOperand(0); Value *Opnd1 = I->getOperand(1); if ((C0 = dyn_cast<ConstantFP>(Opnd0)) && C0->isZero()) - Opnd0 = 0; + Opnd0 = nullptr; if ((C1 = dyn_cast<ConstantFP>(Opnd1)) && C1->isZero()) - Opnd1 = 0; + Opnd1 = nullptr; if (Opnd0) { if (!C0) Addend0.set(1, Opnd0); else - Addend0.set(C0, 0); + Addend0.set(C0, nullptr); } if (Opnd1) { @@ -376,7 +378,7 @@ unsigned FAddend::drillValueDownOneStep if (!C1) Addend.set(1, Opnd1); else - Addend.set(C1, 0); + Addend.set(C1, nullptr); if (Opcode == Instruction::FSub) Addend.negate(); } @@ -385,7 +387,7 @@ unsigned FAddend::drillValueDownOneStep return Opnd0 && Opnd1 ? 2 : 1; // Both operands are zero. Weird! 
- Addend0.set(APFloat(C0->getValueAPF().getSemantics()), 0); + Addend0.set(APFloat(C0->getValueAPF().getSemantics()), nullptr); return 1; } @@ -443,13 +445,13 @@ Value *FAddCombine::performFactorization(Instruction *I) { Instruction *I1 = dyn_cast<Instruction>(I->getOperand(1)); if (!I0 || !I1 || I0->getOpcode() != I1->getOpcode()) - return 0; + return nullptr; bool isMpy = false; if (I0->getOpcode() == Instruction::FMul) isMpy = true; else if (I0->getOpcode() != Instruction::FDiv) - return 0; + return nullptr; Value *Opnd0_0 = I0->getOperand(0); Value *Opnd0_1 = I0->getOperand(1); @@ -461,8 +463,8 @@ Value *FAddCombine::performFactorization(Instruction *I) { // (x*y) +/- (x*z) x y z // (y/x) +/- (z/x) x y z // - Value *Factor = 0; - Value *AddSub0 = 0, *AddSub1 = 0; + Value *Factor = nullptr; + Value *AddSub0 = nullptr, *AddSub1 = nullptr; if (isMpy) { if (Opnd0_0 == Opnd1_0 || Opnd0_0 == Opnd1_1) @@ -481,7 +483,12 @@ Value *FAddCombine::performFactorization(Instruction *I) { } if (!Factor) - return 0; + return nullptr; + + FastMathFlags Flags; + Flags.setUnsafeAlgebra(); + if (I0) Flags &= I->getFastMathFlags(); + if (I1) Flags &= I->getFastMathFlags(); // Create expression "NewAddSub = AddSub0 +/- AddsSub1" Value *NewAddSub = (I->getOpcode() == Instruction::FAdd) ? @@ -490,13 +497,21 @@ Value *FAddCombine::performFactorization(Instruction *I) { if (ConstantFP *CFP = dyn_cast<ConstantFP>(NewAddSub)) { const APFloat &F = CFP->getValueAPF(); if (!F.isNormal()) - return 0; - } + return nullptr; + } else if (Instruction *II = dyn_cast<Instruction>(NewAddSub)) + II->setFastMathFlags(Flags); - if (isMpy) - return createFMul(Factor, NewAddSub); + if (isMpy) { + Value *RI = createFMul(Factor, NewAddSub); + if (Instruction *II = dyn_cast<Instruction>(RI)) + II->setFastMathFlags(Flags); + return RI; + } - return createFDiv(NewAddSub, Factor); + Value *RI = createFDiv(NewAddSub, Factor); + if (Instruction *II = dyn_cast<Instruction>(RI)) + II->setFastMathFlags(Flags); + return RI; } Value *FAddCombine::simplify(Instruction *I) { @@ -504,7 +519,7 @@ Value *FAddCombine::simplify(Instruction *I) { // Currently we are not able to handle vector type. if (I->getType()->isVectorTy()) - return 0; + return nullptr; assert((I->getOpcode() == Instruction::FAdd || I->getOpcode() == Instruction::FSub) && "Expect add/sub"); @@ -555,7 +570,7 @@ Value *FAddCombine::simplify(Instruction *I) { // been optimized into "I = Y - X" in the previous steps. // const FAddendCoef &CE = Opnd0.getCoef(); - return CE.isOne() ? Opnd0.getSymVal() : 0; + return CE.isOne() ? Opnd0.getSymVal() : nullptr; } // step 4: Try to optimize Opnd0 + Opnd1_0 [+ Opnd1_1] @@ -601,7 +616,7 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { // constant close to supper-expr(s) will potentially reveal some optimization // opportunities in super-expr(s). // - const FAddend *ConstAdd = 0; + const FAddend *ConstAdd = nullptr; // Simplified addends are placed <SimpVect>. AddendVect SimpVect; @@ -634,7 +649,7 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { if (T && T->getSymVal() == Val) { // Set null such that next iteration of the outer loop will not process // this addend again. - Addends[SameSymIdx] = 0; + Addends[SameSymIdx] = nullptr; SimpVect.push_back(T); } } @@ -648,7 +663,7 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { // Pop all addends being folded and push the resulting folded addend. 
SimpVect.resize(StartIdx); - if (Val != 0) { + if (Val) { if (!R.isZero()) { SimpVect.push_back(&R); } @@ -685,7 +700,7 @@ Value *FAddCombine::createNaryFAdd // unsigned InstrNeeded = calcInstrNumber(Opnds); if (InstrNeeded > InstrQuota) - return 0; + return nullptr; initCreateInstNum(); @@ -697,7 +712,7 @@ Value *FAddCombine::createNaryFAdd // N-ary addition has at most two instructions, and we don't need to worry // about tree-height when constructing the N-ary addition. - Value *LastVal = 0; + Value *LastVal = nullptr; bool LastValNeedNeg = false; // Iterate the addends, creating fadd/fsub using adjacent two addends. @@ -746,7 +761,10 @@ Value *FAddCombine::createFSub Value *FAddCombine::createFNeg(Value *V) { Value *Zero = cast<Value>(ConstantFP::get(V->getType(), 0.0)); - return createFSub(Zero, V); + Value *NewV = createFSub(Zero, V); + if (Instruction *I = dyn_cast<Instruction>(NewV)) + createInstPostProc(I, true); // fneg's don't receive instruction numbers. + return NewV; } Value *FAddCombine::createFAdd @@ -771,11 +789,13 @@ Value *FAddCombine::createFDiv(Value *Opnd0, Value *Opnd1) { return V; } -void FAddCombine::createInstPostProc(Instruction *NewInstr) { +void FAddCombine::createInstPostProc(Instruction *NewInstr, + bool NoNumber) { NewInstr->setDebugLoc(Instr->getDebugLoc()); // Keep track of the number of instruction created. - incCreateInstNum(); + if (!NoNumber) + incCreateInstNum(); // Propagate fast-math flags NewInstr->setFastMathFlags(Instr->getFastMathFlags()); @@ -845,80 +865,170 @@ Value *FAddCombine::createAddendVal return createFMul(OpndVal, Coeff.getValue(Instr->getType())); } -/// AddOne - Add one to a ConstantInt. -static Constant *AddOne(Constant *C) { - return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1)); -} - -/// SubOne - Subtract one from a ConstantInt. -static Constant *SubOne(ConstantInt *C) { - return ConstantInt::get(C->getContext(), C->getValue()-1); +// If one of the operands only has one non-zero bit, and if the other +// operand has a known-zero bit in a more significant place than it (not +// including the sign bit) the ripple may go up to and fill the zero, but +// won't change the sign. For example, (X & ~4) + 1. +static bool checkRippleForAdd(const APInt &Op0KnownZero, + const APInt &Op1KnownZero) { + APInt Op1MaybeOne = ~Op1KnownZero; + // Make sure that one of the operand has at most one bit set to 1. + if (Op1MaybeOne.countPopulation() != 1) + return false; + + // Find the most significant known 0 other than the sign bit. + int BitWidth = Op0KnownZero.getBitWidth(); + APInt Op0KnownZeroTemp(Op0KnownZero); + Op0KnownZeroTemp.clearBit(BitWidth - 1); + int Op0ZeroPosition = BitWidth - Op0KnownZeroTemp.countLeadingZeros() - 1; + + int Op1OnePosition = BitWidth - Op1MaybeOne.countLeadingZeros() - 1; + assert(Op1OnePosition >= 0); + + // This also covers the case of no known zero, since in that case + // Op0ZeroPosition is -1. + return Op0ZeroPosition >= Op1OnePosition; } - -// dyn_castFoldableMul - If this value is a multiply that can be folded into -// other computations (because it has a constant operand), return the -// non-constant operand of the multiply, and set CST to point to the multiplier. -// Otherwise, return null. 
-// -static inline Value *dyn_castFoldableMul(Value *V, ConstantInt *&CST) { - if (!V->hasOneUse() || !V->getType()->isIntegerTy()) - return 0; - - Instruction *I = dyn_cast<Instruction>(V); - if (I == 0) return 0; - - if (I->getOpcode() == Instruction::Mul) - if ((CST = dyn_cast<ConstantInt>(I->getOperand(1)))) - return I->getOperand(0); - if (I->getOpcode() == Instruction::Shl) - if ((CST = dyn_cast<ConstantInt>(I->getOperand(1)))) { - // The multiplier is really 1 << CST. - uint32_t BitWidth = cast<IntegerType>(V->getType())->getBitWidth(); - uint32_t CSTVal = CST->getLimitedValue(BitWidth); - CST = ConstantInt::get(V->getType()->getContext(), - APInt::getOneBitSet(BitWidth, CSTVal)); - return I->getOperand(0); - } - return 0; -} - - /// WillNotOverflowSignedAdd - Return true if we can prove that: /// (sext (add LHS, RHS)) === (add (sext LHS), (sext RHS)) /// This basically requires proving that the add in the original type would not /// overflow to change the sign bit or have a carry out. +/// TODO: Handle this for Vectors. bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS) { // There are different heuristics we can use for this. Here are some simple // ones. - // Add has the property that adding any two 2's complement numbers can only - // have one carry bit which can change a sign. As such, if LHS and RHS each - // have at least two sign bits, we know that the addition of the two values - // will sign extend fine. + // If LHS and RHS each have at least two sign bits, the addition will look + // like + // + // XX..... + + // YY..... + // + // If the carry into the most significant position is 0, X and Y can't both + // be 1 and therefore the carry out of the addition is also 0. + // + // If the carry into the most significant position is 1, X and Y can't both + // be 0 and therefore the carry out of the addition is also 1. + // + // Since the carry into the most significant position is always equal to + // the carry out of the addition, there is no signed overflow. if (ComputeNumSignBits(LHS) > 1 && ComputeNumSignBits(RHS) > 1) return true; + if (IntegerType *IT = dyn_cast<IntegerType>(LHS->getType())) { + int BitWidth = IT->getBitWidth(); + APInt LHSKnownZero(BitWidth, 0); + APInt LHSKnownOne(BitWidth, 0); + computeKnownBits(LHS, LHSKnownZero, LHSKnownOne); + + APInt RHSKnownZero(BitWidth, 0); + APInt RHSKnownOne(BitWidth, 0); + computeKnownBits(RHS, RHSKnownZero, RHSKnownOne); + + // Addition of two 2's compliment numbers having opposite signs will never + // overflow. + if ((LHSKnownOne[BitWidth - 1] && RHSKnownZero[BitWidth - 1]) || + (LHSKnownZero[BitWidth - 1] && RHSKnownOne[BitWidth - 1])) + return true; + + // Check if carry bit of addition will not cause overflow. + if (checkRippleForAdd(LHSKnownZero, RHSKnownZero)) + return true; + if (checkRippleForAdd(RHSKnownZero, LHSKnownZero)) + return true; + } + return false; +} - // If one of the operands only has one non-zero bit, and if the other operand - // has a known-zero bit in a more significant place than it (not including the - // sign bit) the ripple may go up to and fill the zero, but won't change the - // sign. For example, (X & ~4) + 1. - - // TODO: Implement. +/// WillNotOverflowUnsignedAdd - Return true if we can prove that: +/// (zext (add LHS, RHS)) === (add (zext LHS), (zext RHS)) +bool InstCombiner::WillNotOverflowUnsignedAdd(Value *LHS, Value *RHS) { + // There are different heuristics we can use for this. Here is a simple one. 
+ // If the sign bit of LHS and that of RHS are both zero, no unsigned wrap. + bool LHSKnownNonNegative, LHSKnownNegative; + bool RHSKnownNonNegative, RHSKnownNegative; + ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, 0); + ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, 0); + if (LHSKnownNonNegative && RHSKnownNonNegative) + return true; return false; } -Instruction *InstCombiner::visitAdd(BinaryOperator &I) { - bool Changed = SimplifyAssociativeOrCommutative(I); +// Checks if any operand is negative and we can convert add to sub. +// This function checks for following negative patterns +// ADD(XOR(OR(Z, NOT(C)), C)), 1) == NEG(AND(Z, C)) +// ADD(XOR(AND(Z, C), C), 1) == NEG(OR(Z, ~C)) +// XOR(AND(Z, C), (C + 1)) == NEG(OR(Z, ~C)) if C is even +static Value *checkForNegativeOperand(BinaryOperator &I, + InstCombiner::BuilderTy *Builder) { Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); - if (Value *V = SimplifyAddInst(LHS, RHS, I.hasNoSignedWrap(), - I.hasNoUnsignedWrap(), TD)) - return ReplaceInstUsesWith(I, V); + // This function creates 2 instructions to replace ADD, we need at least one + // of LHS or RHS to have one use to ensure benefit in transform. + if (!LHS->hasOneUse() && !RHS->hasOneUse()) + return nullptr; + + Value *X = nullptr, *Y = nullptr, *Z = nullptr; + const APInt *C1 = nullptr, *C2 = nullptr; + + // if ONE is on other side, swap + if (match(RHS, m_Add(m_Value(X), m_One()))) + std::swap(LHS, RHS); + + if (match(LHS, m_Add(m_Value(X), m_One()))) { + // if XOR on other side, swap + if (match(RHS, m_Xor(m_Value(Y), m_APInt(C1)))) + std::swap(X, RHS); + + if (match(X, m_Xor(m_Value(Y), m_APInt(C1)))) { + // X = XOR(Y, C1), Y = OR(Z, C2), C2 = NOT(C1) ==> X == NOT(AND(Z, C1)) + // ADD(ADD(X, 1), RHS) == ADD(X, ADD(RHS, 1)) == SUB(RHS, AND(Z, C1)) + if (match(Y, m_Or(m_Value(Z), m_APInt(C2))) && (*C2 == ~(*C1))) { + Value *NewAnd = Builder->CreateAnd(Z, *C1); + return Builder->CreateSub(RHS, NewAnd, "sub"); + } else if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && (*C1 == *C2)) { + // X = XOR(Y, C1), Y = AND(Z, C2), C2 == C1 ==> X == NOT(OR(Z, ~C1)) + // ADD(ADD(X, 1), RHS) == ADD(X, ADD(RHS, 1)) == SUB(RHS, OR(Z, ~C1)) + Value *NewOr = Builder->CreateOr(Z, ~(*C1)); + return Builder->CreateSub(RHS, NewOr, "sub"); + } + } + } - // (A*B)+(A*C) -> A*(B+C) etc + // Restore LHS and RHS + LHS = I.getOperand(0); + RHS = I.getOperand(1); + + // if XOR is on other side, swap + if (match(RHS, m_Xor(m_Value(Y), m_APInt(C1)))) + std::swap(LHS, RHS); + + // C2 is ODD + // LHS = XOR(Y, C1), Y = AND(Z, C2), C1 == (C2 + 1) => LHS == NEG(OR(Z, ~C2)) + // ADD(LHS, RHS) == SUB(RHS, OR(Z, ~C2)) + if (match(LHS, m_Xor(m_Value(Y), m_APInt(C1)))) + if (C1->countTrailingZeros() == 0) + if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && *C1 == (*C2 + 1)) { + Value *NewOr = Builder->CreateOr(Z, ~(*C2)); + return Builder->CreateSub(RHS, NewOr, "sub"); + } + return nullptr; +} + +Instruction *InstCombiner::visitAdd(BinaryOperator &I) { + bool Changed = SimplifyAssociativeOrCommutative(I); + Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); + + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + + if (Value *V = SimplifyAddInst(LHS, RHS, I.hasNoSignedWrap(), + I.hasNoUnsignedWrap(), DL)) + return ReplaceInstUsesWith(I, V); + + // (A*B)+(A*C) -> A*(B+C) etc if (Value *V = SimplifyUsingDistributiveLaws(I)) return ReplaceInstUsesWith(I, V); @@ -938,7 +1048,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { if 
(ZI->getSrcTy()->isIntegerTy(1)) return SelectInst::Create(ZI->getOperand(0), AddOne(CI), CI); - Value *XorLHS = 0; ConstantInt *XorRHS = 0; + Value *XorLHS = nullptr; ConstantInt *XorRHS = nullptr; if (match(LHS, m_Xor(m_Value(XorLHS), m_ConstantInt(XorRHS)))) { uint32_t TySizeBits = I.getType()->getScalarSizeInBits(); const APInt &RHSVal = CI->getValue(); @@ -970,7 +1080,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { IntegerType *IT = cast<IntegerType>(I.getType()); APInt LHSKnownOne(IT->getBitWidth(), 0); APInt LHSKnownZero(IT->getBitWidth(), 0); - ComputeMaskedBits(XorLHS, LHSKnownZero, LHSKnownOne); + computeKnownBits(XorLHS, LHSKnownZero, LHSKnownOne); if ((XorRHS->getValue() | LHSKnownZero).isAllOnesValue()) return BinaryOperator::CreateSub(ConstantExpr::getAdd(XorRHS, CI), XorLHS); @@ -987,7 +1097,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { if (Instruction *NV = FoldOpIntoPhi(I)) return NV; - if (I.getType()->isIntegerTy(1)) + if (I.getType()->getScalarType()->isIntegerTy(1)) return BinaryOperator::CreateXor(LHS, RHS); // X + X --> X << 1 @@ -1016,31 +1126,18 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { if (Value *V = dyn_castNegVal(RHS)) return BinaryOperator::CreateSub(LHS, V); - - ConstantInt *C2; - if (Value *X = dyn_castFoldableMul(LHS, C2)) { - if (X == RHS) // X*C + X --> X * (C+1) - return BinaryOperator::CreateMul(RHS, AddOne(C2)); - - // X*C1 + X*C2 --> X * (C1+C2) - ConstantInt *C1; - if (X == dyn_castFoldableMul(RHS, C1)) - return BinaryOperator::CreateMul(X, ConstantExpr::getAdd(C1, C2)); - } - - // X + X*C --> X * (C+1) - if (dyn_castFoldableMul(RHS, C2) == LHS) - return BinaryOperator::CreateMul(LHS, AddOne(C2)); + if (Value *V = checkForNegativeOperand(I, Builder)) + return ReplaceInstUsesWith(I, V); // A+B --> A|B iff A and B have no bits set in common. if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) { APInt LHSKnownOne(IT->getBitWidth(), 0); APInt LHSKnownZero(IT->getBitWidth(), 0); - ComputeMaskedBits(LHS, LHSKnownZero, LHSKnownOne); + computeKnownBits(LHS, LHSKnownZero, LHSKnownOne); if (LHSKnownZero != 0) { APInt RHSKnownOne(IT->getBitWidth(), 0); APInt RHSKnownZero(IT->getBitWidth(), 0); - ComputeMaskedBits(RHS, RHSKnownZero, RHSKnownOne); + computeKnownBits(RHS, RHSKnownZero, RHSKnownOne); // No bits in common -> bitwise or. 
if ((LHSKnownZero|RHSKnownZero).isAllOnesValue()) @@ -1048,35 +1145,16 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { } } - // W*X + Y*Z --> W * (X+Z) iff W == Y - { - Value *W, *X, *Y, *Z; - if (match(LHS, m_Mul(m_Value(W), m_Value(X))) && - match(RHS, m_Mul(m_Value(Y), m_Value(Z)))) { - if (W != Y) { - if (W == Z) { - std::swap(Y, Z); - } else if (Y == X) { - std::swap(W, X); - } else if (X == Z) { - std::swap(Y, Z); - std::swap(W, X); - } - } - - if (W == Y) { - Value *NewAdd = Builder->CreateAdd(X, Z, LHS->getName()); - return BinaryOperator::CreateMul(W, NewAdd); - } - } + if (Constant *CRHS = dyn_cast<Constant>(RHS)) { + Value *X; + if (match(LHS, m_Not(m_Value(X)))) // ~X + C --> (C-1) - X + return BinaryOperator::CreateSub(SubOne(CRHS), X); } if (ConstantInt *CRHS = dyn_cast<ConstantInt>(RHS)) { - Value *X = 0; - if (match(LHS, m_Not(m_Value(X)))) // ~X + C --> (C-1) - X - return BinaryOperator::CreateSub(SubOne(CRHS), X); - // (X & FF00) + xx00 -> (X+xx00) & FF00 + Value *X; + ConstantInt *C2; if (LHS->hasOneUse() && match(LHS, m_And(m_Value(X), m_ConstantInt(C2))) && CRHS->getValue() == (CRHS->getValue() & C2->getValue())) { @@ -1164,7 +1242,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { // Check for (x & y) + (x ^ y) { - Value *A = 0, *B = 0; + Value *A = nullptr, *B = nullptr; if (match(RHS, m_Xor(m_Value(A), m_Value(B))) && (match(LHS, m_And(m_Specific(A), m_Specific(B))) || match(LHS, m_And(m_Specific(B), m_Specific(A))))) @@ -1176,14 +1254,29 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { return BinaryOperator::CreateOr(A, B); } - return Changed ? &I : 0; + // TODO(jingyue): Consider WillNotOverflowSignedAdd and + // WillNotOverflowUnsignedAdd to reduce the number of invocations of + // computeKnownBits. + if (!I.hasNoSignedWrap() && WillNotOverflowSignedAdd(LHS, RHS)) { + Changed = true; + I.setHasNoSignedWrap(true); + } + if (!I.hasNoUnsignedWrap() && WillNotOverflowUnsignedAdd(LHS, RHS)) { + Changed = true; + I.setHasNoUnsignedWrap(true); + } + + return Changed ? &I : nullptr; } Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); - if (Value *V = SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(), TD)) + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + + if (Value *V = SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(), DL)) return ReplaceInstUsesWith(I, V); if (isa<Constant>(RHS)) { @@ -1198,13 +1291,19 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { // -A + B --> B - A // -A + -B --> -(A + B) - if (Value *LHSV = dyn_castFNegVal(LHS)) - return BinaryOperator::CreateFSub(RHS, LHSV); + if (Value *LHSV = dyn_castFNegVal(LHS)) { + Instruction *RI = BinaryOperator::CreateFSub(RHS, LHSV); + RI->copyFastMathFlags(&I); + return RI; + } // A + -B --> A - B if (!isa<Constant>(RHS)) - if (Value *V = dyn_castFNegVal(RHS)) - return BinaryOperator::CreateFSub(LHS, V); + if (Value *V = dyn_castFNegVal(RHS)) { + Instruction *RI = BinaryOperator::CreateFSub(LHS, V); + RI->copyFastMathFlags(&I); + return RI; + } // Check for (fadd double (sitofp x), y), see if we can merge this into an // integer add followed by a promotion. 
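Stepping back to the visitAdd hunk above: the pass now upgrades a plain add to nsw/nuw when WillNotOverflowSignedAdd or WillNotOverflowUnsignedAdd can prove the wrap impossible. The sketch below checks the two proof obligations exhaustively at 8 bits — two or more sign bits on each operand rule out signed overflow, and a clear sign bit on both operands rules out unsigned wrap. numSignBits is a hand-rolled stand-in for ComputeNumSignBits, written only for this demonstration.

    #include <cassert>
    #include <cstdint>

    static int numSignBits(int8_t V) {
      int N = 1; // the sign bit itself
      for (int B = 6; B >= 0 && (((V >> B) & 1) == ((V >> 7) & 1)); --B)
        ++N;     // count further copies of the sign bit below it
      return N;
    }

    int main() {
      for (int A = -128; A < 128; ++A)
        for (int B = -128; B < 128; ++B) {
          // Two or more sign bits on each side: signed add cannot overflow.
          if (numSignBits((int8_t)A) > 1 && numSignBits((int8_t)B) > 1)
            assert(A + B >= -128 && A + B <= 127);
          // Sign bit known zero on both sides: unsigned add cannot wrap.
          if (A >= 0 && B >= 0)
            assert(A + B <= 255);
        }
      return 0;
    }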
@@ -1250,7 +1349,7 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { if (match(LHS, m_Select(m_Value(C1), m_Value(A1), m_Value(B1))) && match(RHS, m_Select(m_Value(C2), m_Value(A2), m_Value(B2)))) { if (C1 == C2) { - Constant *Z1=0, *Z2=0; + Constant *Z1=nullptr, *Z2=nullptr; Value *A, *B, *C=C1; if (match(A1, m_AnyZero()) && match(B2, m_AnyZero())) { Z1 = dyn_cast<Constant>(A1); A = A2; @@ -1274,7 +1373,7 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { return ReplaceInstUsesWith(I, V); } - return Changed ? &I : 0; + return Changed ? &I : nullptr; } @@ -1284,12 +1383,12 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { /// Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS, Type *Ty) { - assert(TD && "Must have target data info for this"); + assert(DL && "Must have target data info for this"); // If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize // this. bool Swapped = false; - GEPOperator *GEP1 = 0, *GEP2 = 0; + GEPOperator *GEP1 = nullptr, *GEP2 = nullptr; // For now we require one side to be the base pointer "A" or a constant // GEP derived from it. @@ -1327,9 +1426,9 @@ Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS, // Avoid duplicating the arithmetic if GEP2 has non-constant indices and // multiple users. - if (GEP1 == 0 || - (GEP2 != 0 && !GEP2->hasAllConstantIndices() && !GEP2->hasOneUse())) - return 0; + if (!GEP1 || + (GEP2 && !GEP2->hasAllConstantIndices() && !GEP2->hasOneUse())) + return nullptr; // Emit the offset of the GEP and an intptr_t. Value *Result = EmitGEPOffset(GEP1); @@ -1352,8 +1451,11 @@ Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS, Instruction *InstCombiner::visitSub(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + if (Value *V = SimplifySubInst(Op0, Op1, I.hasNoSignedWrap(), - I.hasNoUnsignedWrap(), TD)) + I.hasNoUnsignedWrap(), DL)) return ReplaceInstUsesWith(I, V); // (A*B)-(A*C) -> A*(B-C) etc @@ -1375,51 +1477,53 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { if (match(Op0, m_AllOnes())) return BinaryOperator::CreateNot(Op1); - if (ConstantInt *C = dyn_cast<ConstantInt>(Op0)) { + if (Constant *C = dyn_cast<Constant>(Op0)) { // C - ~X == X + (1+C) - Value *X = 0; + Value *X = nullptr; if (match(Op1, m_Not(m_Value(X)))) return BinaryOperator::CreateAdd(X, AddOne(C)); - // -(X >>u 31) -> (X >>s 31) - // -(X >>s 31) -> (X >>u 31) - if (C->isZero()) { - Value *X; ConstantInt *CI; - if (match(Op1, m_LShr(m_Value(X), m_ConstantInt(CI))) && - // Verify we are shifting out everything but the sign bit. - CI->getValue() == I.getType()->getPrimitiveSizeInBits()-1) - return BinaryOperator::CreateAShr(X, CI); - - if (match(Op1, m_AShr(m_Value(X), m_ConstantInt(CI))) && - // Verify we are shifting out everything but the sign bit. - CI->getValue() == I.getType()->getPrimitiveSizeInBits()-1) - return BinaryOperator::CreateLShr(X, CI); - } - // Try to fold constant sub into select arguments. 
if (SelectInst *SI = dyn_cast<SelectInst>(Op1)) if (Instruction *R = FoldOpIntoSelect(I, SI)) return R; // C-(X+C2) --> (C-C2)-X - ConstantInt *C2; - if (match(Op1, m_Add(m_Value(X), m_ConstantInt(C2)))) + Constant *C2; + if (match(Op1, m_Add(m_Value(X), m_Constant(C2)))) return BinaryOperator::CreateSub(ConstantExpr::getSub(C, C2), X); if (SimplifyDemandedInstructionBits(I)) return &I; // Fold (sub 0, (zext bool to B)) --> (sext bool to B) - if (C->isZero() && match(Op1, m_ZExt(m_Value(X)))) - if (X->getType()->isIntegerTy(1)) + if (C->isNullValue() && match(Op1, m_ZExt(m_Value(X)))) + if (X->getType()->getScalarType()->isIntegerTy(1)) return CastInst::CreateSExtOrBitCast(X, Op1->getType()); // Fold (sub 0, (sext bool to B)) --> (zext bool to B) - if (C->isZero() && match(Op1, m_SExt(m_Value(X)))) - if (X->getType()->isIntegerTy(1)) + if (C->isNullValue() && match(Op1, m_SExt(m_Value(X)))) + if (X->getType()->getScalarType()->isIntegerTy(1)) return CastInst::CreateZExtOrBitCast(X, Op1->getType()); } + if (ConstantInt *C = dyn_cast<ConstantInt>(Op0)) { + // -(X >>u 31) -> (X >>s 31) + // -(X >>s 31) -> (X >>u 31) + if (C->isZero()) { + Value *X; ConstantInt *CI; + if (match(Op1, m_LShr(m_Value(X), m_ConstantInt(CI))) && + // Verify we are shifting out everything but the sign bit. + CI->getValue() == I.getType()->getPrimitiveSizeInBits()-1) + return BinaryOperator::CreateAShr(X, CI); + + if (match(Op1, m_AShr(m_Value(X), m_ConstantInt(CI))) && + // Verify we are shifting out everything but the sign bit. + CI->getValue() == I.getType()->getPrimitiveSizeInBits()-1) + return BinaryOperator::CreateLShr(X, CI); + } + } + { Value *Y; // X-(X+Y) == -Y X-(Y+X) == -Y @@ -1433,9 +1537,9 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { } if (Op1->hasOneUse()) { - Value *X = 0, *Y = 0, *Z = 0; - Constant *C = 0; - ConstantInt *CI = 0; + Value *X = nullptr, *Y = nullptr, *Z = nullptr; + Constant *C = nullptr; + Constant *CI = nullptr; // (X - (Y - Z)) --> (X + (Z - Y)). if (match(Op1, m_Sub(m_Value(Y), m_Value(Z)))) @@ -1449,9 +1553,9 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { return BinaryOperator::CreateAnd(Op0, Builder->CreateNot(Y, Y->getName() + ".not")); - // 0 - (X sdiv C) -> (X sdiv -C) - if (match(Op1, m_SDiv(m_Value(X), m_Constant(C))) && - match(Op0, m_Zero())) + // 0 - (X sdiv C) -> (X sdiv -C) provided the negation doesn't overflow. + if (match(Op1, m_SDiv(m_Value(X), m_Constant(C))) && match(Op0, m_Zero()) && + !C->isMinSignedValue()) return BinaryOperator::CreateSDiv(X, ConstantExpr::getNeg(C)); // 0 - (X << Y) -> (-X << Y) when X is freely negatable. 
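The visitSub hunk above adds a !C->isMinSignedValue() guard before rewriting 0 - (X sdiv C) as X sdiv -C. A short demonstration of why, at 8 bits: negating the minimum signed value wraps back to itself, so the folded divisor would be unchanged while the original expression negates the quotient. Plain C++, assuming two's-complement wrap on the int8_t casts.

    #include <cstdint>
    #include <cstdio>

    int main() {
      int8_t X = -128, C = -128;               // C is the minimum value
      int8_t Original = (int8_t)(0 - (X / C)); // 0 - 1 == -1
      int8_t NegC = (int8_t)(0 - C);           // -(-128) wraps to -128
      int8_t Folded = (int8_t)(X / NegC);      // -128 / -128 == 1
      std::printf("original=%d folded=%d\n", Original, Folded); // -1 vs 1
      return 0;
    }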
@@ -1459,19 +1563,6 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { if (Value *XNeg = dyn_castNegVal(X)) return BinaryOperator::CreateShl(XNeg, Y); - // X - X*C --> X * (1-C) - if (match(Op1, m_Mul(m_Specific(Op0), m_ConstantInt(CI)))) { - Constant *CP1 = ConstantExpr::getSub(ConstantInt::get(I.getType(),1), CI); - return BinaryOperator::CreateMul(Op0, CP1); - } - - // X - X<<C --> X * (1-(1<<C)) - if (match(Op1, m_Shl(m_Specific(Op0), m_ConstantInt(CI)))) { - Constant *One = ConstantInt::get(I.getType(), 1); - C = ConstantExpr::getSub(One, ConstantExpr::getShl(One, CI)); - return BinaryOperator::CreateMul(Op0, C); - } - // X - A*-B -> X + A*B // X - -A*B -> X + A*B Value *A, *B; @@ -1481,26 +1572,16 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { // X - A*CI -> X + A*-CI // X - CI*A -> X + A*-CI - if (match(Op1, m_Mul(m_Value(A), m_ConstantInt(CI))) || - match(Op1, m_Mul(m_ConstantInt(CI), m_Value(A)))) { + if (match(Op1, m_Mul(m_Value(A), m_Constant(CI))) || + match(Op1, m_Mul(m_Constant(CI), m_Value(A)))) { Value *NewMul = Builder->CreateMul(A, ConstantExpr::getNeg(CI)); return BinaryOperator::CreateAdd(Op0, NewMul); } } - ConstantInt *C1; - if (Value *X = dyn_castFoldableMul(Op0, C1)) { - if (X == Op1) // X*C - X --> X * (C-1) - return BinaryOperator::CreateMul(Op1, SubOne(C1)); - - ConstantInt *C2; // X*C1 - X*C2 -> X * (C1-C2) - if (X == dyn_castFoldableMul(Op1, C2)) - return BinaryOperator::CreateMul(X, ConstantExpr::getSub(C1, C2)); - } - // Optimize pointer differences into the same array into a size. Consider: // &A[10] - &A[0]: we should compile this to "10". - if (TD) { + if (DL) { Value *LHSOp, *RHSOp; if (match(Op0, m_PtrToInt(m_Value(LHSOp))) && match(Op1, m_PtrToInt(m_Value(RHSOp)))) @@ -1512,15 +1593,18 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp))))) if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType())) return ReplaceInstUsesWith(I, Res); - } + } - return 0; + return nullptr; } Instruction *InstCombiner::visitFSub(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - if (Value *V = SimplifyFSubInst(Op0, Op1, I.getFastMathFlags(), TD)) + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + + if (Value *V = SimplifyFSubInst(Op0, Op1, I.getFastMathFlags(), DL)) return ReplaceInstUsesWith(I, V); if (isa<Constant>(Op0)) @@ -1556,5 +1640,5 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) { return ReplaceInstUsesWith(I, V); } - return 0; + return nullptr; } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 88bb69b..b23a606 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -13,22 +13,14 @@ #include "InstCombine.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/Intrinsics.h" -#include "llvm/Support/ConstantRange.h" -#include "llvm/Support/PatternMatch.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Transforms/Utils/CmpInstAnalysis.h" using namespace llvm; using namespace PatternMatch; - -/// AddOne - Add one to a ConstantInt. -static Constant *AddOne(ConstantInt *C) { - return ConstantInt::get(C->getContext(), C->getValue() + 1); -} -/// SubOne - Subtract one from a ConstantInt. 
-static Constant *SubOne(ConstantInt *C) { - return ConstantInt::get(C->getContext(), C->getValue()-1); -} +#define DEBUG_TYPE "instcombine" /// isFreeToInvert - Return true if the specified value is free to invert (apply /// ~ to). This happens in cases where the ~ can be eliminated. @@ -60,7 +52,7 @@ static inline Value *dyn_castNotVal(Value *V) { // Constants can be considered to be not'ed values... if (ConstantInt *C = dyn_cast<ConstantInt>(V)) return ConstantInt::get(C->getType(), ~C->getValue()); - return 0; + return nullptr; } /// getFCmpCode - Similar to getICmpCode but for FCmpInst. This encodes a fcmp @@ -133,7 +125,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op, ConstantInt *AndRHS, BinaryOperator &TheAnd) { Value *X = Op->getOperand(0); - Constant *Together = 0; + Constant *Together = nullptr; if (!Op->isShift()) Together = ConstantExpr::getAnd(AndRHS, OpRHS); @@ -260,7 +252,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op, } break; } - return 0; + return nullptr; } /// Emit a computation of: (V >= Lo && V < Hi) if Inside is true, otherwise @@ -342,12 +334,12 @@ Value *InstCombiner::FoldLogicalPlusAnd(Value *LHS, Value *RHS, Instruction &I) { Instruction *LHSI = dyn_cast<Instruction>(LHS); if (!LHSI || LHSI->getNumOperands() != 2 || - !isa<ConstantInt>(LHSI->getOperand(1))) return 0; + !isa<ConstantInt>(LHSI->getOperand(1))) return nullptr; ConstantInt *N = cast<ConstantInt>(LHSI->getOperand(1)); switch (LHSI->getOpcode()) { - default: return 0; + default: return nullptr; case Instruction::And: if (ConstantExpr::getAnd(N, Mask) == Mask) { // If the AndRHS is a power of two minus one (0+1+), this is simple. @@ -367,7 +359,7 @@ Value *InstCombiner::FoldLogicalPlusAnd(Value *LHS, Value *RHS, break; } } - return 0; + return nullptr; case Instruction::Or: case Instruction::Xor: // If the AndRHS is a power of two minus one (0+1+), and N&Mask == 0 @@ -375,7 +367,7 @@ Value *InstCombiner::FoldLogicalPlusAnd(Value *LHS, Value *RHS, Mask->getValue().countPopulation()) == Mask->getValue().getBitWidth() && ConstantExpr::getAnd(N, Mask)->isNullValue()) break; - return 0; + return nullptr; } if (isSub) @@ -428,12 +420,12 @@ static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C, ConstantInt *BCst = dyn_cast<ConstantInt>(B); ConstantInt *CCst = dyn_cast<ConstantInt>(C); bool icmp_eq = (SCC == ICmpInst::ICMP_EQ); - bool icmp_abit = (ACst != 0 && !ACst->isZero() && + bool icmp_abit = (ACst && !ACst->isZero() && ACst->getValue().isPowerOf2()); - bool icmp_bbit = (BCst != 0 && !BCst->isZero() && + bool icmp_bbit = (BCst && !BCst->isZero() && BCst->getValue().isPowerOf2()); unsigned result = 0; - if (CCst != 0 && CCst->isZero()) { + if (CCst && CCst->isZero()) { // if C is zero, then both A and B qualify as mask result |= (icmp_eq ? (FoldMskICmp_Mask_AllZeroes | FoldMskICmp_Mask_AllZeroes | @@ -465,7 +457,7 @@ static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C, FoldMskICmp_AMask_NotMixed) : (FoldMskICmp_Mask_AllZeroes | FoldMskICmp_AMask_Mixed)); - } else if (ACst != 0 && CCst != 0 && + } else if (ACst && CCst && ConstantExpr::getAnd(ACst, CCst) == CCst) { result |= (icmp_eq ? FoldMskICmp_AMask_Mixed : FoldMskICmp_AMask_NotMixed); @@ -480,7 +472,7 @@ static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C, FoldMskICmp_BMask_NotMixed) : (FoldMskICmp_Mask_AllZeroes | FoldMskICmp_BMask_Mixed)); - } else if (BCst != 0 && CCst != 0 && + } else if (BCst && CCst && ConstantExpr::getAnd(BCst, CCst) == CCst) { result |= (icmp_eq ? 
FoldMskICmp_BMask_Mixed : FoldMskICmp_BMask_NotMixed); @@ -513,31 +505,46 @@ static unsigned conjugateICmpMask(unsigned Mask) { /// decomposition fails. static bool decomposeBitTestICmp(const ICmpInst *I, ICmpInst::Predicate &Pred, Value *&X, Value *&Y, Value *&Z) { - // X < 0 is equivalent to (X & SignBit) != 0. - if (I->getPredicate() == ICmpInst::ICMP_SLT) - if (ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1))) - if (C->isZero()) { - X = I->getOperand(0); - Y = ConstantInt::get(I->getContext(), - APInt::getSignBit(C->getBitWidth())); - Pred = ICmpInst::ICMP_NE; - Z = C; - return true; - } + ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)); + if (!C) + return false; - // X > -1 is equivalent to (X & SignBit) == 0. - if (I->getPredicate() == ICmpInst::ICMP_SGT) - if (ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1))) - if (C->isAllOnesValue()) { - X = I->getOperand(0); - Y = ConstantInt::get(I->getContext(), - APInt::getSignBit(C->getBitWidth())); - Pred = ICmpInst::ICMP_EQ; - Z = ConstantInt::getNullValue(C->getType()); - return true; - } + switch (I->getPredicate()) { + default: + return false; + case ICmpInst::ICMP_SLT: + // X < 0 is equivalent to (X & SignBit) != 0. + if (!C->isZero()) + return false; + Y = ConstantInt::get(I->getContext(), APInt::getSignBit(C->getBitWidth())); + Pred = ICmpInst::ICMP_NE; + break; + case ICmpInst::ICMP_SGT: + // X > -1 is equivalent to (X & SignBit) == 0. + if (!C->isAllOnesValue()) + return false; + Y = ConstantInt::get(I->getContext(), APInt::getSignBit(C->getBitWidth())); + Pred = ICmpInst::ICMP_EQ; + break; + case ICmpInst::ICMP_ULT: + // X <u 2^n is equivalent to (X & ~(2^n-1)) == 0. + if (!C->getValue().isPowerOf2()) + return false; + Y = ConstantInt::get(I->getContext(), -C->getValue()); + Pred = ICmpInst::ICMP_EQ; + break; + case ICmpInst::ICMP_UGT: + // X >u 2^n-1 is equivalent to (X & ~(2^n-1)) != 0. + if (!(C->getValue() + 1).isPowerOf2()) + return false; + Y = ConstantInt::get(I->getContext(), ~C->getValue()); + Pred = ICmpInst::ICMP_NE; + break; + } - return false; + X = I->getOperand(0); + Z = ConstantInt::getNullValue(C->getType()); + return true; } /// foldLogOpOfMaskedICmpsHelper: @@ -565,12 +572,12 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, Value *L11,*L12,*L21,*L22; // Check whether the icmp can be decomposed into a bit test. if (decomposeBitTestICmp(LHS, LHSCC, L11, L12, L2)) { - L21 = L22 = L1 = 0; + L21 = L22 = L1 = nullptr; } else { // Look for ANDs in the LHS icmp. if (!L1->getType()->isIntegerTy()) { // You can icmp pointers, for example. They really aren't masks. - L11 = L12 = 0; + L11 = L12 = nullptr; } else if (!match(L1, m_And(m_Value(L11), m_Value(L12)))) { // Any icmp can be viewed as being trivially masked; if it allows us to // remove one, it's worth it. @@ -580,7 +587,7 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, if (!L2->getType()->isIntegerTy()) { // You can icmp pointers, for example. They really aren't masks. 
- L21 = L22 = 0; + L21 = L22 = nullptr; } else if (!match(L2, m_And(m_Value(L21), m_Value(L22)))) { L21 = L2; L22 = Constant::getAllOnesValue(L2->getType()); @@ -603,7 +610,7 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, } else { return 0; } - E = R2; R1 = 0; ok = true; + E = R2; R1 = nullptr; ok = true; } else if (R1->getType()->isIntegerTy()) { if (!match(R1, m_And(m_Value(R11), m_Value(R12)))) { // As before, model no mask as a trivial mask if it'll let us do an @@ -660,11 +667,11 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, /// into a single (icmp(A & X) ==/!= Y) static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, llvm::InstCombiner::BuilderTy* Builder) { - Value *A = 0, *B = 0, *C = 0, *D = 0, *E = 0; + Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr; ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate(); unsigned mask = foldLogOpOfMaskedICmpsHelper(A, B, C, D, E, LHS, RHS, LHSCC, RHSCC); - if (mask == 0) return 0; + if (mask == 0) return nullptr; assert(ICmpInst::isEquality(LHSCC) && ICmpInst::isEquality(RHSCC) && "foldLogOpOfMaskedICmpsHelper must return an equality predicate."); @@ -717,9 +724,9 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, // their actual values. This isn't strictly, necessary, just a "handle the // easy cases for now" decision. ConstantInt *BCst = dyn_cast<ConstantInt>(B); - if (BCst == 0) return 0; + if (!BCst) return nullptr; ConstantInt *DCst = dyn_cast<ConstantInt>(D); - if (DCst == 0) return 0; + if (!DCst) return nullptr; if (mask & (FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_BMask_NotAllOnes)) { // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and @@ -758,11 +765,11 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, // (icmp ne (A & B), B) & (icmp eq (A & D), D) // with B and D, having a single bit set ConstantInt *CCst = dyn_cast<ConstantInt>(C); - if (CCst == 0) return 0; + if (!CCst) return nullptr; if (LHSCC != NEWCC) CCst = dyn_cast<ConstantInt>( ConstantExpr::getXor(BCst, CCst) ); ConstantInt *ECst = dyn_cast<ConstantInt>(E); - if (ECst == 0) return 0; + if (!ECst) return nullptr; if (RHSCC != NEWCC) ECst = dyn_cast<ConstantInt>( ConstantExpr::getXor(DCst, ECst) ); ConstantInt* MCst = dyn_cast<ConstantInt>( @@ -771,13 +778,13 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, // if there is a conflict we should actually return a false for the // whole construct if (!MCst->isZero()) - return 0; + return nullptr; Value *newOr1 = Builder->CreateOr(B, D); Value *newOr2 = ConstantExpr::getOr(CCst, ECst); Value *newAnd = Builder->CreateAnd(A, newOr1); return Builder->CreateICmp(NEWCC, newAnd, newOr2); } - return 0; + return nullptr; } /// FoldAndOfICmps - Fold (icmp)&(icmp) if possible. 
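decomposeBitTestICmp grows two unsigned cases in the hunk above: X <u 2^n becomes (X & -2^n) == 0 and X >u 2^n-1 becomes (X & ~(2^n-1)) != 0, which lets foldLogOpOfMaskedICmps treat unsigned range checks as mask tests. The standalone program below verifies both equivalences exhaustively over all 8-bit values and all powers of two; note that -2^n and ~(2^n-1) are the same mask.

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned n = 0; n < 8; ++n) {
        uint8_t Pow = (uint8_t)(1u << n);     // C = 2^n
        uint8_t LowMask = (uint8_t)(Pow - 1); // 2^n - 1
        uint8_t HighMask = (uint8_t)~LowMask; // ~(2^n - 1) == -2^n
        for (unsigned x = 0; x < 256; ++x) {
          uint8_t X = (uint8_t)x;
          assert((X < Pow) == ((X & HighMask) == 0));     // ult case
          assert((X > LowMask) == ((X & HighMask) != 0)); // ugt case
        }
      }
      return 0;
    }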
@@ -806,7 +813,7 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0); ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1)); ConstantInt *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1)); - if (LHSCst == 0 || RHSCst == 0) return 0; + if (!LHSCst || !RHSCst) return nullptr; if (LHSCst == RHSCst && LHSCC == RHSCC) { // (icmp ult A, C) & (icmp ult B, C) --> (icmp ult (A|B), C) @@ -830,7 +837,7 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { if (LHSCC == ICmpInst::ICMP_EQ && LHSCC == RHSCC && LHS->hasOneUse() && RHS->hasOneUse()) { Value *V; - ConstantInt *AndCst, *SmallCst = 0, *BigCst = 0; + ConstantInt *AndCst, *SmallCst = nullptr, *BigCst = nullptr; // (trunc x) == C1 & (and x, CA) == C2 // (and x, CA) == C2 & (trunc x) == C1 @@ -861,14 +868,14 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { // From here on, we only handle: // (icmp1 A, C1) & (icmp2 A, C2) --> something simpler. - if (Val != Val2) return 0; + if (Val != Val2) return nullptr; // ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere. if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE || RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE || LHSCC == ICmpInst::ICMP_SGE || LHSCC == ICmpInst::ICMP_SLE || RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE) - return 0; + return nullptr; // Make a constant range that's the intersection of the two icmp ranges. // If the intersection is empty, we know that the result is false. @@ -882,7 +889,7 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { // We can't fold (ugt x, C) & (sgt x, C2). if (!PredicatesFoldable(LHSCC, RHSCC)) - return 0; + return nullptr; // Ensure that the larger constant is on the RHS. bool ShouldSwap; @@ -1011,7 +1018,7 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { break; } - return 0; + return nullptr; } /// FoldAndOfFCmps - Optimize (fcmp)&(fcmp). NOTE: Unlike the rest of @@ -1021,7 +1028,7 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { if (LHS->getPredicate() == FCmpInst::FCMP_ORD && RHS->getPredicate() == FCmpInst::FCMP_ORD) { if (LHS->getOperand(0)->getType() != RHS->getOperand(0)->getType()) - return 0; + return nullptr; // (fcmp ord x, c) & (fcmp ord y, c) -> (fcmp ord x, y) if (ConstantFP *LHSC = dyn_cast<ConstantFP>(LHS->getOperand(1))) @@ -1038,7 +1045,7 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { if (isa<ConstantAggregateZero>(LHS->getOperand(1)) && isa<ConstantAggregateZero>(RHS->getOperand(1))) return Builder->CreateFCmpORD(LHS->getOperand(0), RHS->getOperand(0)); - return 0; + return nullptr; } Value *Op0LHS = LHS->getOperand(0), *Op0RHS = LHS->getOperand(1); @@ -1091,7 +1098,7 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { } } - return 0; + return nullptr; } @@ -1099,7 +1106,10 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - if (Value *V = SimplifyAndInst(Op0, Op1, TD)) + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + + if (Value *V = SimplifyAndInst(Op0, Op1, DL)) return ReplaceInstUsesWith(I, V); // (A|B)&(A|C) -> A|(B&C) etc @@ -1193,7 +1203,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { // If this is an integer truncation, and if the source is an 'and' with // immediate, transform it. 
This frequently occurs for bitfield accesses. { - Value *X = 0; ConstantInt *YC = 0; + Value *X = nullptr; ConstantInt *YC = nullptr; if (match(Op0, m_Trunc(m_And(m_Value(X), m_ConstantInt(YC))))) { // Change: and (trunc (and X, YC) to T), C2 // into : and (trunc X to T), trunc(YC) & C2 @@ -1226,7 +1236,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { } { - Value *A = 0, *B = 0, *C = 0, *D = 0; + Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr; // (A|B) & ~(A&B) -> A^B if (match(Op0, m_Or(m_Value(A), m_Value(B))) && match(Op1, m_Not(m_And(m_Value(C), m_Value(D)))) && @@ -1334,7 +1344,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { } { - Value *X = 0; + Value *X = nullptr; bool OpsSwapped = false; // Canonicalize SExt or Not to the LHS if (match(Op1, m_SExt(m_Value())) || @@ -1361,7 +1371,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { std::swap(Op0, Op1); } - return Changed ? &I : 0; + return Changed ? &I : nullptr; } /// CollectBSwapParts - Analyze the specified subexpression and see if it is @@ -1493,7 +1503,7 @@ Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) { if (!ITy || ITy->getBitWidth() % 16 || // ByteMask only allows up to 32-byte values. ITy->getBitWidth() > 32*8) - return 0; // Can only bswap pairs of bytes. Can't do vectors. + return nullptr; // Can only bswap pairs of bytes. Can't do vectors. /// ByteValues - For each byte of the result, we keep track of which value /// defines each byte. @@ -1503,16 +1513,16 @@ Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) { // Try to find all the pieces corresponding to the bswap. uint32_t ByteMask = ~0U >> (32-ByteValues.size()); if (CollectBSwapParts(&I, 0, ByteMask, ByteValues)) - return 0; + return nullptr; // Check to see if all of the bytes come from the same value. Value *V = ByteValues[0]; - if (V == 0) return 0; // Didn't find a byte? Must be zero. + if (!V) return nullptr; // Didn't find a byte? Must be zero. // Check to make sure that all of the bytes come from the same value. for (unsigned i = 1, e = ByteValues.size(); i != e; ++i) if (ByteValues[i] != V) - return 0; + return nullptr; Module *M = I.getParent()->getParent()->getParent(); Function *F = Intrinsic::getDeclaration(M, Intrinsic::bswap, ITy); return CallInst::Create(F, V); @@ -1524,10 +1534,10 @@ Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) { static Instruction *MatchSelectFromAndOr(Value *A, Value *B, Value *C, Value *D) { // If A is not a select of -1/0, this cannot match. - Value *Cond = 0; + Value *Cond = nullptr; if (!match(A, m_SExt(m_Value(Cond))) || !Cond->getType()->isIntegerTy(1)) - return 0; + return nullptr; // ((cond?-1:0)&C) | (B&(cond?0:-1)) -> cond ? C : B. if (match(D, m_Not(m_SExt(m_Specific(Cond))))) @@ -1540,24 +1550,7 @@ static Instruction *MatchSelectFromAndOr(Value *A, Value *B, return SelectInst::Create(Cond, C, D); if (match(B, m_SExt(m_Not(m_Specific(Cond))))) return SelectInst::Create(Cond, C, D); - return 0; -} - -/// IsOneHotValue - Returns true for "one-hot" values (values where at most -/// one bit can be set). -static bool IsOneHotValue(Value *V) { - // Match 1<<K. - if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) - if (BO->getOpcode() == Instruction::Shl) { - ConstantInt *One = dyn_cast<ConstantInt>(BO->getOperand(0)); - return One && One->isOne(); - } - - // Check for power of two integer constants. 
- if (ConstantInt *K = dyn_cast<ConstantInt>(V)) - return K->getValue().isPowerOf2(); - - return false; + return nullptr; } /// FoldOrOfICmps - Fold (icmp)|(icmp) if possible. @@ -1578,16 +1571,16 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { LAnd->getOpcode() == Instruction::And && RAnd->getOpcode() == Instruction::And) { - Value *Mask = 0; - Value *Masked = 0; + Value *Mask = nullptr; + Value *Masked = nullptr; if (LAnd->getOperand(0) == RAnd->getOperand(0) && - IsOneHotValue(LAnd->getOperand(1)) && - IsOneHotValue(RAnd->getOperand(1))) { + isKnownToBeAPowerOfTwo(LAnd->getOperand(1)) && + isKnownToBeAPowerOfTwo(RAnd->getOperand(1))) { Mask = Builder->CreateOr(LAnd->getOperand(1), RAnd->getOperand(1)); Masked = Builder->CreateAnd(LAnd->getOperand(0), Mask); } else if (LAnd->getOperand(1) == RAnd->getOperand(1) && - IsOneHotValue(LAnd->getOperand(0)) && - IsOneHotValue(RAnd->getOperand(0))) { + isKnownToBeAPowerOfTwo(LAnd->getOperand(0)) && + isKnownToBeAPowerOfTwo(RAnd->getOperand(0))) { Mask = Builder->CreateOr(LAnd->getOperand(0), RAnd->getOperand(0)); Masked = Builder->CreateAnd(LAnd->getOperand(1), Mask); } @@ -1620,7 +1613,7 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { if (LHS->hasOneUse() || RHS->hasOneUse()) { // (icmp eq B, 0) | (icmp ult A, B) -> (icmp ule A, B-1) // (icmp eq B, 0) | (icmp ugt B, A) -> (icmp ule A, B-1) - Value *A = 0, *B = 0; + Value *A = nullptr, *B = nullptr; if (LHSCC == ICmpInst::ICMP_EQ && LHSCst && LHSCst->isZero()) { B = Val; if (RHSCC == ICmpInst::ICMP_ULT && Val == RHS->getOperand(1)) @@ -1644,7 +1637,7 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { } // This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2). - if (LHSCst == 0 || RHSCst == 0) return 0; + if (!LHSCst || !RHSCst) return nullptr; if (LHSCst == RHSCst && LHSCC == RHSCC) { // (icmp ne A, 0) | (icmp ne B, 0) --> (icmp ne (A|B), 0) @@ -1665,18 +1658,18 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { // From here on, we only handle: // (icmp1 A, C1) | (icmp2 A, C2) --> something simpler. - if (Val != Val2) return 0; + if (Val != Val2) return nullptr; // ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere. if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE || RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE || LHSCC == ICmpInst::ICMP_SGE || LHSCC == ICmpInst::ICMP_SLE || RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE) - return 0; + return nullptr; // We can't fold (ugt x, C) | (sgt x, C2). if (!PredicatesFoldable(LHSCC, RHSCC)) - return 0; + return nullptr; // Ensure that the larger constant is on the RHS. bool ShouldSwap; @@ -1821,7 +1814,7 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { } break; } - return 0; + return nullptr; } /// FoldOrOfFCmps - Optimize (fcmp)|(fcmp). 
NOTE: Unlike the rest of @@ -1849,7 +1842,7 @@ Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { isa<ConstantAggregateZero>(RHS->getOperand(1))) return Builder->CreateFCmpUNO(LHS->getOperand(0), RHS->getOperand(0)); - return 0; + return nullptr; } Value *Op0LHS = LHS->getOperand(0), *Op0RHS = LHS->getOperand(1); @@ -1881,7 +1874,7 @@ Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { return getFCmpValue(Op0Ordered, Op0Pred|Op1Pred, Op0LHS, Op0RHS, Builder); } } - return 0; + return nullptr; } /// FoldOrWithConstants - This helper function folds: @@ -1896,28 +1889,31 @@ Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { Instruction *InstCombiner::FoldOrWithConstants(BinaryOperator &I, Value *Op, Value *A, Value *B, Value *C) { ConstantInt *CI1 = dyn_cast<ConstantInt>(C); - if (!CI1) return 0; + if (!CI1) return nullptr; - Value *V1 = 0; - ConstantInt *CI2 = 0; - if (!match(Op, m_And(m_Value(V1), m_ConstantInt(CI2)))) return 0; + Value *V1 = nullptr; + ConstantInt *CI2 = nullptr; + if (!match(Op, m_And(m_Value(V1), m_ConstantInt(CI2)))) return nullptr; APInt Xor = CI1->getValue() ^ CI2->getValue(); - if (!Xor.isAllOnesValue()) return 0; + if (!Xor.isAllOnesValue()) return nullptr; if (V1 == A || V1 == B) { Value *NewOp = Builder->CreateAnd((V1 == A) ? B : A, CI1); return BinaryOperator::CreateOr(NewOp, V1); } - return 0; + return nullptr; } Instruction *InstCombiner::visitOr(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - if (Value *V = SimplifyOrInst(Op0, Op1, TD)) + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + + if (Value *V = SimplifyOrInst(Op0, Op1, DL)) return ReplaceInstUsesWith(I, V); // (A&B)|(A&C) -> A&(B|C) etc @@ -1930,7 +1926,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { return &I; if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) { - ConstantInt *C1 = 0; Value *X = 0; + ConstantInt *C1 = nullptr; Value *X = nullptr; // (X & C1) | C2 --> (X | C2) & (C1|C2) // iff (C1 & C2) == 0. if (match(Op0, m_And(m_Value(X), m_ConstantInt(C1))) && @@ -1961,8 +1957,8 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { return NV; } - Value *A = 0, *B = 0; - ConstantInt *C1 = 0, *C2 = 0; + Value *A = nullptr, *B = nullptr; + ConstantInt *C1 = nullptr, *C2 = nullptr; // (A | B) | C and A | (B | C) -> bswap if possible. // (A >> B) | (C << D) and (A << B) | (B >> C) -> bswap if possible. @@ -1993,36 +1989,13 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { } // (A & C)|(B & D) - Value *C = 0, *D = 0; + Value *C = nullptr, *D = nullptr; if (match(Op0, m_And(m_Value(A), m_Value(C))) && match(Op1, m_And(m_Value(B), m_Value(D)))) { - Value *V1 = 0, *V2 = 0; + Value *V1 = nullptr, *V2 = nullptr; C1 = dyn_cast<ConstantInt>(C); C2 = dyn_cast<ConstantInt>(D); if (C1 && C2) { // (A & C1)|(B & C2) - // If we have: ((V + N) & C1) | (V & C2) - // .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0 - // replace with V+N. - if (C1->getValue() == ~C2->getValue()) { - if ((C2->getValue() & (C2->getValue()+1)) == 0 && // C2 == 0+1+ - match(A, m_Add(m_Value(V1), m_Value(V2)))) { - // Add commutes, try both ways. - if (V1 == B && MaskedValueIsZero(V2, C2->getValue())) - return ReplaceInstUsesWith(I, A); - if (V2 == B && MaskedValueIsZero(V1, C2->getValue())) - return ReplaceInstUsesWith(I, A); - } - // Or commutes, try both ways. 
- if ((C1->getValue() & (C1->getValue()+1)) == 0 && - match(B, m_Add(m_Value(V1), m_Value(V2)))) { - // Add commutes, try both ways. - if (V1 == A && MaskedValueIsZero(V2, C1->getValue())) - return ReplaceInstUsesWith(I, B); - if (V2 == A && MaskedValueIsZero(V1, C1->getValue())) - return ReplaceInstUsesWith(I, B); - } - } - if ((C1->getValue() & C2->getValue()) == 0) { // ((V | N) & C1) | (V & C2) --> (V|N) & (C1|C2) // iff (C1&C2) == 0 and (N&~C1) == 0 @@ -2040,7 +2013,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { // ((V|C3)&C1) | ((V|C4)&C2) --> (V|C3|C4)&(C1|C2) // iff (C1&C2) == 0 and (C3&~C1) == 0 and (C4&~C2) == 0. - ConstantInt *C3 = 0, *C4 = 0; + ConstantInt *C3 = nullptr, *C4 = nullptr; if (match(A, m_Or(m_Value(V1), m_ConstantInt(C3))) && (C3->getValue() & ~C1->getValue()) == 0 && match(B, m_Or(m_Specific(V1), m_ConstantInt(C4))) && @@ -2232,7 +2205,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { // Since this OR statement hasn't been optimized further yet, we hope // that this transformation will allow the new ORs to be optimized. { - Value *X = 0, *Y = 0; + Value *X = nullptr, *Y = nullptr; if (Op0->hasOneUse() && Op1->hasOneUse() && match(Op0, m_Select(m_Value(X), m_Value(A), m_Value(B))) && match(Op1, m_Select(m_Value(Y), m_Value(C), m_Value(D))) && X == Y) { @@ -2242,14 +2215,17 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { } } - return Changed ? &I : 0; + return Changed ? &I : nullptr; } Instruction *InstCombiner::visitXor(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - if (Value *V = SimplifyXorInst(Op0, Op1, TD)) + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + + if (Value *V = SimplifyXorInst(Op0, Op1, DL)) return ReplaceInstUsesWith(I, V); // (A&B)^(A&C) -> A&(B^C) etc @@ -2506,5 +2482,5 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { } } - return Changed ? &I : 0; + return Changed ? 
&I : nullptr; } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 0cd7b14..658178d 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -14,14 +14,16 @@ #include "InstCombine.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/DataLayout.h" -#include "llvm/Support/CallSite.h" -#include "llvm/Support/PatternMatch.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace PatternMatch; +#define DEBUG_TYPE "instcombine" + STATISTIC(NumSimplified, "Number of library calls simplified"); /// getPromotedType - Return the specified type promoted as it would be to pass @@ -56,8 +58,8 @@ static Type *reduceToSingleValueType(Type *T) { } Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { - unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), TD); - unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), TD); + unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL); + unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL); unsigned MinAlign = std::min(DstAlign, SrcAlign); unsigned CopyAlign = MI->getAlignment(); @@ -70,17 +72,17 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with // load/store. ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getArgOperand(2)); - if (MemOpLength == 0) return 0; + if (!MemOpLength) return nullptr; // Source and destination pointer types are always "i8*" for intrinsic. See // if the size is something we can handle with a single primitive load/store. // A single load+store correctly handles overlapping memory in the memmove // case. uint64_t Size = MemOpLength->getLimitedValue(); - assert(Size && "0-sized memory transfering should be removed already."); + assert(Size && "0-sized memory transferring should be removed already."); if (Size > 8 || (Size&(Size-1))) - return 0; // If not 1/2/4/8 bytes, exit. + return nullptr; // If not 1/2/4/8 bytes, exit. // Use an integer load+store unless we can find something better. unsigned SrcAddrSp = @@ -99,11 +101,11 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { // dest address will be promotable. See if we can find a better type than the // integer datatype. Value *StrippedDest = MI->getArgOperand(0)->stripPointerCasts(); - MDNode *CopyMD = 0; + MDNode *CopyMD = nullptr; if (StrippedDest != MI->getArgOperand(0)) { Type *SrcETy = cast<PointerType>(StrippedDest->getType()) ->getElementType(); - if (TD && SrcETy->isSized() && TD->getTypeStoreSize(SrcETy) == Size) { + if (DL && SrcETy->isSized() && DL->getTypeStoreSize(SrcETy) == Size) { // The SrcETy might be something like {{{double}}} or [1 x double]. Rip // down through these levels if so. 
SrcETy = reduceToSingleValueType(SrcETy); @@ -152,7 +154,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { } Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { - unsigned Alignment = getKnownAlignment(MI->getDest(), TD); + unsigned Alignment = getKnownAlignment(MI->getDest(), DL); if (MI->getAlignment() < Alignment) { MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), Alignment, false)); @@ -163,7 +165,7 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength()); ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue()); if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8)) - return 0; + return nullptr; uint64_t Len = LenC->getLimitedValue(); Alignment = MI->getAlignment(); assert(Len && "0-sized memory setting should be removed already."); @@ -191,7 +193,7 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { return MI; } - return 0; + return nullptr; } /// visitCallInst - CallInst simplification. This mostly only handles folding @@ -233,7 +235,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // No other transformations apply to volatile transfers. if (MI->isVolatile()) - return 0; + return nullptr; // If we have a memmove and the source operation is a constant global, // then the source and dest pointers can't alias, so we can change this @@ -274,13 +276,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { default: break; case Intrinsic::objectsize: { uint64_t Size; - if (getObjectSize(II->getArgOperand(0), Size, TD, TLI)) + if (getObjectSize(II->getArgOperand(0), Size, DL, TLI)) return ReplaceInstUsesWith(CI, ConstantInt::get(CI.getType(), Size)); - return 0; + return nullptr; } case Intrinsic::bswap: { Value *IIOperand = II->getArgOperand(0); - Value *X = 0; + Value *X = nullptr; // bswap(bswap(x)) -> x if (match(IIOperand, m_BSwap(m_Value(X)))) @@ -320,7 +322,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { uint32_t BitWidth = IT->getBitWidth(); APInt KnownZero(BitWidth, 0); APInt KnownOne(BitWidth, 0); - ComputeMaskedBits(II->getArgOperand(0), KnownZero, KnownOne); + computeKnownBits(II->getArgOperand(0), KnownZero, KnownOne); unsigned TrailingZeros = KnownOne.countTrailingZeros(); APInt Mask(APInt::getLowBitsSet(BitWidth, TrailingZeros)); if ((Mask & KnownZero) == Mask) @@ -338,7 +340,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { uint32_t BitWidth = IT->getBitWidth(); APInt KnownZero(BitWidth, 0); APInt KnownOne(BitWidth, 0); - ComputeMaskedBits(II->getArgOperand(0), KnownZero, KnownOne); + computeKnownBits(II->getArgOperand(0), KnownZero, KnownOne); unsigned LeadingZeros = KnownOne.countLeadingZeros(); APInt Mask(APInt::getHighBitsSet(BitWidth, LeadingZeros)); if ((Mask & KnownZero) == Mask) @@ -353,14 +355,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { uint32_t BitWidth = IT->getBitWidth(); APInt LHSKnownZero(BitWidth, 0); APInt LHSKnownOne(BitWidth, 0); - ComputeMaskedBits(LHS, LHSKnownZero, LHSKnownOne); + computeKnownBits(LHS, LHSKnownZero, LHSKnownOne); bool LHSKnownNegative = LHSKnownOne[BitWidth - 1]; bool LHSKnownPositive = LHSKnownZero[BitWidth - 1]; if (LHSKnownNegative || LHSKnownPositive) { APInt RHSKnownZero(BitWidth, 0); APInt RHSKnownOne(BitWidth, 0); - ComputeMaskedBits(RHS, RHSKnownZero, RHSKnownOne); + computeKnownBits(RHS, RHSKnownZero, RHSKnownOne); bool RHSKnownNegative = RHSKnownOne[BitWidth - 1]; bool RHSKnownPositive = RHSKnownZero[BitWidth - 1]; if (LHSKnownNegative && 
RHSKnownNegative) { @@ -419,6 +421,21 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return InsertValueInst::Create(Struct, II->getArgOperand(0), 0); } } + + // We can strength-reduce this signed add into a regular add if we + // can prove that it will never overflow. + if (II->getIntrinsicID() == Intrinsic::sadd_with_overflow) { + Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); + if (WillNotOverflowSignedAdd(LHS, RHS)) { + Value *Add = Builder->CreateNSWAdd(LHS, RHS); + Add->takeName(&CI); + Constant *V[] = {UndefValue::get(Add->getType()), Builder->getFalse()}; + StructType *ST = cast<StructType>(II->getType()); + Constant *Struct = ConstantStruct::get(ST, V); + return InsertValueInst::Create(Struct, Add, 0); + } + } + break; case Intrinsic::usub_with_overflow: case Intrinsic::ssub_with_overflow: @@ -447,10 +464,10 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { APInt LHSKnownZero(BitWidth, 0); APInt LHSKnownOne(BitWidth, 0); - ComputeMaskedBits(LHS, LHSKnownZero, LHSKnownOne); + computeKnownBits(LHS, LHSKnownZero, LHSKnownOne); APInt RHSKnownZero(BitWidth, 0); APInt RHSKnownOne(BitWidth, 0); - ComputeMaskedBits(RHS, RHSKnownZero, RHSKnownOne); + computeKnownBits(RHS, RHSKnownZero, RHSKnownOne); // Get the largest possible values for each operand. APInt LHSMax = ~LHSKnownZero; @@ -504,7 +521,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: // Turn PPC lvx -> load if the pointer is known aligned. - if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, TD) >= 16) { + if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL) >= 16) { Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), PointerType::getUnqual(II->getType())); return new LoadInst(Ptr); @@ -513,7 +530,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::ppc_altivec_stvx: case Intrinsic::ppc_altivec_stvxl: // Turn stvx -> store if the pointer is known aligned. - if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, TD) >= 16) { + if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL) >= 16) { Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType()); Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy); @@ -524,7 +541,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_sse2_storeu_pd: case Intrinsic::x86_sse2_storeu_dq: // Turn X86 storeu -> store if the pointer is known aligned. - if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, TD) >= 16) { + if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL) >= 16) { Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(1)->getType()); Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), OpPtrTy); @@ -554,6 +571,79 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + // Constant fold <A x Bi> << Ci. + // FIXME: We don't handle _dq because it's a shift of an i128, but is + // represented in the IR as <2 x i64>. A per-element shift is wrong.
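Editorial aside, not part of the diff: the cases that follow implement the rule stated in the comment above. A minimal standalone C++ sketch of that rule for the 32-bit-element variants (foldConstShift and everything else here is illustrative, not LLVM API): a constant count of the element width or more folds to a zero vector, and anything smaller becomes an ordinary splat shl or lshr.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    std::vector<uint32_t> foldConstShift(std::vector<uint32_t> V,
                                         uint64_t Count, bool IsLeft) {
      const unsigned EltBits = 32;
      if (Count > EltBits - 1)                 // count >= element width: all zeros
        return std::vector<uint32_t>(V.size(), 0);
      for (uint32_t &E : V)                    // otherwise splat shl / lshr
        E = IsLeft ? (E << Count) : (E >> Count);
      return V;
    }

    int main() {
      for (uint32_t E : foldConstShift({0x80000000u, 4, 9, 1}, 3, false))
        std::printf("%u ", E);                 // prints: 268435456 0 1 0
      std::printf("\n");
    }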
+ case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_sse2_pslli_d: + case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_sse2_pslli_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: { + // Simplify if count is constant. To 0 if >= BitWidth, + // otherwise to shl/lshr. + auto CDV = dyn_cast<ConstantDataVector>(II->getArgOperand(1)); + auto CInt = dyn_cast<ConstantInt>(II->getArgOperand(1)); + if (!CDV && !CInt) + break; + ConstantInt *Count; + if (CDV) + Count = cast<ConstantInt>(CDV->getElementAsConstant(0)); + else + Count = CInt; + + auto Vec = II->getArgOperand(0); + auto VT = cast<VectorType>(Vec->getType()); + if (Count->getZExtValue() > + VT->getElementType()->getPrimitiveSizeInBits() - 1) + return ReplaceInstUsesWith( + CI, ConstantAggregateZero::get(Vec->getType())); + + bool isPackedShiftLeft = true; + switch (II->getIntrinsicID()) { + default : break; + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: isPackedShiftLeft = false; break; + } + + unsigned VWidth = VT->getNumElements(); + // Get a constant vector of the same type as the first operand. + auto VTCI = ConstantInt::get(VT->getElementType(), Count->getZExtValue()); + if (isPackedShiftLeft) + return BinaryOperator::CreateShl(Vec, + Builder->CreateVectorSplat(VWidth, VTCI)); + + return BinaryOperator::CreateLShr(Vec, + Builder->CreateVectorSplat(VWidth, VTCI)); + } case Intrinsic::x86_sse41_pmovsxbw: case Intrinsic::x86_sse41_pmovsxwd: @@ -576,8 +666,160 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::x86_sse4a_insertqi: { + // insertqi x, y, 64, 0 can just copy y's lower bits and leave the top + // ones undef + // TODO: eventually we should lower this intrinsic to IR + if (auto CIWidth = dyn_cast<ConstantInt>(II->getArgOperand(2))) { + if (auto CIStart = dyn_cast<ConstantInt>(II->getArgOperand(3))) { + if (CIWidth->equalsInt(64) && CIStart->isZero()) { + Value *Vec = II->getArgOperand(1); + Value *Undef = UndefValue::get(Vec->getType()); + const uint32_t Mask[] = { 0, 2 }; + return ReplaceInstUsesWith( + CI, + Builder->CreateShuffleVector( + Vec, Undef, ConstantDataVector::get( + II->getContext(), ArrayRef<uint32_t>(Mask)))); + + } else if (auto Source = + dyn_cast<IntrinsicInst>(II->getArgOperand(0))) { + if (Source->hasOneUse() && + Source->getArgOperand(1) == II->getArgOperand(1)) { + // If the source of the insert has only one use and it's another + // insert (and they're both inserting from the same vector), try to + // bundle both together. 
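Editorial aside: the merge that the following lines perform is an interval union over bit ranges. A standalone C++ sketch mirroring the same arithmetic (mergeInsertQI is an illustrative name): two ranges [Start, Start+Width) and [SourceStart, SourceStart+SourceWidth) are combined whenever they touch or overlap; otherwise both inserts are kept.

    #include <algorithm>
    #include <cstdio>

    bool mergeInsertQI(unsigned Start, unsigned Width,
                       unsigned SourceStart, unsigned SourceWidth,
                       unsigned &NewStart, unsigned &NewWidth) {
      unsigned End = Start + Width, SourceEnd = SourceStart + SourceWidth;
      if (Start <= SourceStart && SourceStart <= End) {
        NewStart = Start;
        NewWidth = std::max(End, SourceEnd) - NewStart;
        return true;
      }
      if (SourceStart <= Start && Start <= SourceEnd) {
        NewStart = SourceStart;
        NewWidth = std::max(SourceEnd, End) - NewStart;
        return true;
      }
      return false;                            // ranges are disjoint: keep both
    }

    int main() {
      unsigned S, W;
      if (mergeInsertQI(8, 16, 16, 16, S, W))
        std::printf("merged start=%u width=%u\n", S, W);   // start=8 width=24
    }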
+ auto CISourceWidth = + dyn_cast<ConstantInt>(Source->getArgOperand(2)); + auto CISourceStart = + dyn_cast<ConstantInt>(Source->getArgOperand(3)); + if (CISourceStart && CISourceWidth) { + unsigned Start = CIStart->getZExtValue(); + unsigned Width = CIWidth->getZExtValue(); + unsigned End = Start + Width; + unsigned SourceStart = CISourceStart->getZExtValue(); + unsigned SourceWidth = CISourceWidth->getZExtValue(); + unsigned SourceEnd = SourceStart + SourceWidth; + unsigned NewStart, NewWidth; + bool ShouldReplace = false; + if (Start <= SourceStart && SourceStart <= End) { + NewStart = Start; + NewWidth = std::max(End, SourceEnd) - NewStart; + ShouldReplace = true; + } else if (SourceStart <= Start && Start <= SourceEnd) { + NewStart = SourceStart; + NewWidth = std::max(SourceEnd, End) - NewStart; + ShouldReplace = true; + } + + if (ShouldReplace) { + Constant *ConstantWidth = ConstantInt::get( + II->getArgOperand(2)->getType(), NewWidth, false); + Constant *ConstantStart = ConstantInt::get( + II->getArgOperand(3)->getType(), NewStart, false); + Value *Args[4] = { Source->getArgOperand(0), + II->getArgOperand(1), ConstantWidth, + ConstantStart }; + Module *M = CI.getParent()->getParent()->getParent(); + Value *F = + Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); + return ReplaceInstUsesWith(CI, Builder->CreateCall(F, Args)); + } + } + } + } + } + } + break; + } + + case Intrinsic::x86_sse41_pblendvb: + case Intrinsic::x86_sse41_blendvps: + case Intrinsic::x86_sse41_blendvpd: + case Intrinsic::x86_avx_blendv_ps_256: + case Intrinsic::x86_avx_blendv_pd_256: + case Intrinsic::x86_avx2_pblendvb: { + // Convert blendv* to vector selects if the mask is constant. + // This optimization is convoluted because the intrinsic is defined as + // getting a vector of floats or doubles for the ps and pd versions. + // FIXME: That should be changed. + Value *Mask = II->getArgOperand(2); + if (auto C = dyn_cast<ConstantDataVector>(Mask)) { + auto Tyi1 = Builder->getInt1Ty(); + auto SelectorType = cast<VectorType>(Mask->getType()); + auto EltTy = SelectorType->getElementType(); + unsigned Size = SelectorType->getNumElements(); + unsigned BitWidth = + EltTy->isFloatTy() + ? 32 + : (EltTy->isDoubleTy() ? 64 : EltTy->getIntegerBitWidth()); + assert((BitWidth == 64 || BitWidth == 32 || BitWidth == 8) && + "Wrong arguments for variable blend intrinsic"); + SmallVector<Constant *, 32> Selectors; + for (unsigned I = 0; I < Size; ++I) { + // The intrinsics only read the top bit + uint64_t Selector; + if (BitWidth == 8) + Selector = C->getElementAsInteger(I); + else + Selector = C->getElementAsAPFloat(I).bitcastToAPInt().getZExtValue(); + Selectors.push_back(ConstantInt::get(Tyi1, Selector >> (BitWidth - 1))); + } + auto NewSelector = ConstantVector::get(Selectors); + return SelectInst::Create(NewSelector, II->getArgOperand(1), + II->getArgOperand(0), "blendv"); + } else { + break; + } + } + + case Intrinsic::x86_avx_vpermilvar_ps: + case Intrinsic::x86_avx_vpermilvar_ps_256: + case Intrinsic::x86_avx_vpermilvar_pd: + case Intrinsic::x86_avx_vpermilvar_pd_256: { + // Convert vpermil* to shufflevector if the mask is constant. + Value *V = II->getArgOperand(1); + unsigned Size = cast<VectorType>(V->getType())->getNumElements(); + assert(Size == 8 || Size == 4 || Size == 2); + uint32_t Indexes[8]; + if (auto C = dyn_cast<ConstantDataVector>(V)) { + // The intrinsics only read one or two bits, clear the rest. 
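Editorial aside: the loop that follows maps a constant vpermilvar mask to shufflevector indexes. A standalone C++ sketch of that mapping (permilvarIndexes is an illustrative name): only the low one or two bits of each mask element are read, the pd variants select with bit 1, and the _256 forms are then offset so each 128-bit half shuffles within itself.

    #include <cstdint>
    #include <cstdio>

    void permilvarIndexes(const uint64_t *Mask, unsigned Size, bool IsPD,
                          bool Is256, uint32_t *Indexes) {
      for (unsigned I = 0; I < Size; ++I) {
        uint32_t Index = Mask[I] & 0x3;        // only one or two bits are read
        if (IsPD)
          Index >>= 1;                         // pd variants select with bit 1
        Indexes[I] = Index;
      }
      if (Is256)                               // each 128-bit half is separate
        for (unsigned I = Size / 2; I < Size; ++I)
          Indexes[I] += Size / 2;
    }

    int main() {
      uint64_t Mask[4] = {2, 0, 2, 0};         // swap within each 128-bit half
      uint32_t Idx[4];
      permilvarIndexes(Mask, 4, true, true, Idx);
      for (unsigned I = 0; I < 4; ++I)
        std::printf("%u ", Idx[I]);            // prints: 1 0 3 2
      std::printf("\n");
    }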
+ for (unsigned I = 0; I < Size; ++I) { + uint32_t Index = C->getElementAsInteger(I) & 0x3; + if (II->getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd || + II->getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd_256) + Index >>= 1; + Indexes[I] = Index; + } + } else if (isa<ConstantAggregateZero>(V)) { + for (unsigned I = 0; I < Size; ++I) + Indexes[I] = 0; + } else { + break; + } + // The _256 variants are a bit trickier since the mask bits always index + // into the corresponding 128 half. In order to convert to a generic + // shuffle, we have to make that explicit. + if (II->getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_ps_256 || + II->getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd_256) { + for (unsigned I = Size / 2; I < Size; ++I) + Indexes[I] += Size / 2; + } + auto NewC = + ConstantDataVector::get(V->getContext(), makeArrayRef(Indexes, Size)); + auto V1 = II->getArgOperand(0); + auto V2 = UndefValue::get(V1->getType()); + auto Shuffle = Builder->CreateShuffleVector(V1, V2, NewC); + return ReplaceInstUsesWith(CI, Shuffle); + } + case Intrinsic::ppc_altivec_vperm: // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. + // Note that ppc_altivec_vperm has a big-endian bias, so when creating + // a vectorshuffle for little endian, we must undo the transformation + // performed on vec_perm in altivec.h. That is, we must complement + // the permutation mask with respect to 31 and reverse the order of + // V1 and V2. if (Constant *Mask = dyn_cast<Constant>(II->getArgOperand(2))) { assert(Mask->getType()->getVectorNumElements() == 16 && "Bad type for intrinsic!"); @@ -586,8 +828,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { bool AllEltsOk = true; for (unsigned i = 0; i != 16; ++i) { Constant *Elt = Mask->getAggregateElement(i); - if (Elt == 0 || - !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) { + if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) { AllEltsOk = false; break; } @@ -611,10 +852,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { unsigned Idx = cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue(); Idx &= 31; // Match the hardware behavior. + if (DL && DL->isLittleEndian()) + Idx = 31 - Idx; - if (ExtractedElts[Idx] == 0) { + if (!ExtractedElts[Idx]) { + Value *Op0ToUse = (DL && DL->isLittleEndian()) ? Op1 : Op0; + Value *Op1ToUse = (DL && DL->isLittleEndian()) ? Op0 : Op1; ExtractedElts[Idx] = - Builder->CreateExtractElement(Idx < 16 ? Op0 : Op1, + Builder->CreateExtractElement(Idx < 16 ? 
Op0ToUse : Op1ToUse, Builder->getInt32(Idx&15)); } @@ -641,7 +886,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::arm_neon_vst2lane: case Intrinsic::arm_neon_vst3lane: case Intrinsic::arm_neon_vst4lane: { - unsigned MemAlign = getKnownAlignment(II->getArgOperand(0), TD); + unsigned MemAlign = getKnownAlignment(II->getArgOperand(0), DL); unsigned AlignArg = II->getNumArgOperands() - 1; ConstantInt *IntrAlign = dyn_cast<ConstantInt>(II->getArgOperand(AlignArg)); if (IntrAlign && IntrAlign->getZExtValue() < MemAlign) { @@ -654,7 +899,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { } case Intrinsic::arm_neon_vmulls: - case Intrinsic::arm_neon_vmullu: { + case Intrinsic::arm_neon_vmullu: + case Intrinsic::aarch64_neon_smull: + case Intrinsic::aarch64_neon_umull: { Value *Arg0 = II->getArgOperand(0); Value *Arg1 = II->getArgOperand(1); @@ -664,46 +911,46 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { } // Check for constant LHS & RHS - in this case we just simplify. - bool Zext = (II->getIntrinsicID() == Intrinsic::arm_neon_vmullu); + bool Zext = (II->getIntrinsicID() == Intrinsic::arm_neon_vmullu || + II->getIntrinsicID() == Intrinsic::aarch64_neon_umull); VectorType *NewVT = cast<VectorType>(II->getType()); - unsigned NewWidth = NewVT->getElementType()->getIntegerBitWidth(); - if (ConstantDataVector *CV0 = dyn_cast<ConstantDataVector>(Arg0)) { - if (ConstantDataVector *CV1 = dyn_cast<ConstantDataVector>(Arg1)) { - VectorType* VT = cast<VectorType>(CV0->getType()); - SmallVector<Constant*, 4> NewElems; - for (unsigned i = 0; i < VT->getNumElements(); ++i) { - APInt CV0E = - (cast<ConstantInt>(CV0->getAggregateElement(i)))->getValue(); - CV0E = Zext ? CV0E.zext(NewWidth) : CV0E.sext(NewWidth); - APInt CV1E = - (cast<ConstantInt>(CV1->getAggregateElement(i)))->getValue(); - CV1E = Zext ? CV1E.zext(NewWidth) : CV1E.sext(NewWidth); - NewElems.push_back( - ConstantInt::get(NewVT->getElementType(), CV0E * CV1E)); - } - return ReplaceInstUsesWith(CI, ConstantVector::get(NewElems)); + if (Constant *CV0 = dyn_cast<Constant>(Arg0)) { + if (Constant *CV1 = dyn_cast<Constant>(Arg1)) { + CV0 = ConstantExpr::getIntegerCast(CV0, NewVT, /*isSigned=*/!Zext); + CV1 = ConstantExpr::getIntegerCast(CV1, NewVT, /*isSigned=*/!Zext); + + return ReplaceInstUsesWith(CI, ConstantExpr::getMul(CV0, CV1)); } - // Couldn't simplify - cannonicalize constant to the RHS. + // Couldn't simplify - canonicalize constant to the RHS. std::swap(Arg0, Arg1); } // Handle mul by one: - if (ConstantDataVector *CV1 = dyn_cast<ConstantDataVector>(Arg1)) { + if (Constant *CV1 = dyn_cast<Constant>(Arg1)) if (ConstantInt *Splat = - dyn_cast_or_null<ConstantInt>(CV1->getSplatValue())) { - if (Splat->isOne()) { - if (Zext) - return CastInst::CreateZExtOrBitCast(Arg0, II->getType()); - // else - return CastInst::CreateSExtOrBitCast(Arg0, II->getType()); - } - } - } + dyn_cast_or_null<ConstantInt>(CV1->getSplatValue())) + if (Splat->isOne()) + return CastInst::CreateIntegerCast(Arg0, II->getType(), + /*isSigned=*/!Zext); break; } + case Intrinsic::AMDGPU_rcp: { + if (const ConstantFP *C = dyn_cast<ConstantFP>(II->getArgOperand(0))) { + const APFloat &ArgVal = C->getValueAPF(); + APFloat Val(ArgVal.getSemantics(), 1.0); + APFloat::opStatus Status = Val.divide(ArgVal, + APFloat::rmNearestTiesToEven); + // Only do this if it was exact and therefore not dependent on the + // rounding mode. 
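Editorial aside: the exactness test just below (Status == APFloat::opOK) succeeds for 1.0/x precisely when x is a finite power of two whose reciprocal is still representable; that equivalence is the editor's claim, not the diff's. A standalone C++ sketch of an equivalent host-side check (rcpFoldIsExact is an illustrative name, assuming IEEE binary doubles):

    #include <cmath>
    #include <cstdio>

    bool rcpFoldIsExact(double X) {
      int Exp;
      double M = std::frexp(std::fabs(X), &Exp);   // |X| = M * 2^Exp, M in [0.5, 1)
      return M == 0.5 && std::isfinite(1.0 / X) && 1.0 / X != 0.0;
    }

    int main() {
      std::printf("%d %d\n", rcpFoldIsExact(4.0),   // 1: 1/4 is exact
                  rcpFoldIsExact(3.0));             // 0: 1/3 rounds
    }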
+ if (Status == APFloat::opOK) + return ReplaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val)); + } + + break; + } case Intrinsic::stackrestore: { // If the save is right next to the restore, remove the restore. This can // happen when variable allocas are DCE'd. @@ -762,15 +1009,15 @@ Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) { /// passed through the varargs area, we can eliminate the use of the cast. static bool isSafeToEliminateVarargsCast(const CallSite CS, const CastInst * const CI, - const DataLayout * const TD, + const DataLayout * const DL, const int ix) { if (!CI->isLosslessCast()) return false; - // The size of ByVal arguments is derived from the type, so we + // The size of ByVal or InAlloca arguments is derived from the type, so we // can't change to a type with a different size. If the size were // passed explicitly we could avoid this check. - if (!CS.isByValArgument(ix)) + if (!CS.isByValOrInAllocaArgument(ix)) return true; Type* SrcTy = @@ -778,7 +1025,7 @@ static bool isSafeToEliminateVarargsCast(const CallSite CS, Type* DstTy = cast<PointerType>(CI->getType())->getElementType(); if (!SrcTy->isSized() || !DstTy->isSized()) return false; - if (!TD || TD->getTypeAllocSize(SrcTy) != TD->getTypeAllocSize(DstTy)) + if (!DL || DL->getTypeAllocSize(SrcTy) != DL->getTypeAllocSize(DstTy)) return false; return true; } @@ -787,15 +1034,15 @@ static bool isSafeToEliminateVarargsCast(const CallSite CS, // Currently we're only working with the checking functions, memcpy_chk, // mempcpy_chk, memmove_chk, memset_chk, strcpy_chk, stpcpy_chk, strncpy_chk, // strcat_chk and strncat_chk. -Instruction *InstCombiner::tryOptimizeCall(CallInst *CI, const DataLayout *TD) { - if (CI->getCalledFunction() == 0) return 0; +Instruction *InstCombiner::tryOptimizeCall(CallInst *CI, const DataLayout *DL) { + if (!CI->getCalledFunction()) return nullptr; if (Value *With = Simplifier->optimizeCall(CI)) { ++NumSimplified; return CI->use_empty() ? CI : ReplaceInstUsesWith(*CI, With); } - return 0; + return nullptr; } static IntrinsicInst *FindInitTrampolineFromAlloca(Value *TrampMem) { @@ -803,37 +1050,36 @@ static IntrinsicInst *FindInitTrampolineFromAlloca(Value *TrampMem) { // is good enough in practice and simpler than handling any number of casts. Value *Underlying = TrampMem->stripPointerCasts(); if (Underlying != TrampMem && - (!Underlying->hasOneUse() || *Underlying->use_begin() != TrampMem)) - return 0; + (!Underlying->hasOneUse() || Underlying->user_back() != TrampMem)) + return nullptr; if (!isa<AllocaInst>(Underlying)) - return 0; + return nullptr; - IntrinsicInst *InitTrampoline = 0; - for (Value::use_iterator I = TrampMem->use_begin(), E = TrampMem->use_end(); - I != E; I++) { - IntrinsicInst *II = dyn_cast<IntrinsicInst>(*I); + IntrinsicInst *InitTrampoline = nullptr; + for (User *U : TrampMem->users()) { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(U); if (!II) - return 0; + return nullptr; if (II->getIntrinsicID() == Intrinsic::init_trampoline) { if (InitTrampoline) // More than one init_trampoline writes to this value. Give up. - return 0; + return nullptr; InitTrampoline = II; continue; } if (II->getIntrinsicID() == Intrinsic::adjust_trampoline) // Allow any number of calls to adjust.trampoline. continue; - return 0; + return nullptr; } // No call to init.trampoline found. if (!InitTrampoline) - return 0; + return nullptr; // Check that the alloca is being used in the expected way. 
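Editorial aside: the user scan above accepts exactly one init.trampoline plus any number of adjust.trampoline uses and rejects everything else. A standalone C++ sketch of that single-writer scan over an abstracted use list (all names illustrative):

    #include <cstdio>
    #include <vector>

    enum class TrampUse { Init, Adjust, Other };

    bool findSingleInitTrampoline(const std::vector<TrampUse> &Users,
                                  int &InitIdx) {
      InitIdx = -1;
      for (int I = 0, E = (int)Users.size(); I != E; ++I) {
        if (Users[I] == TrampUse::Init) {
          if (InitIdx != -1)
            return false;                      // a second writer: give up
          InitIdx = I;
        } else if (Users[I] != TrampUse::Adjust) {
          return false;                        // an unanalyzable user
        }
      }
      return InitIdx != -1;                    // false if no init was seen
    }

    int main() {
      int Idx;
      std::vector<TrampUse> Users = {TrampUse::Adjust, TrampUse::Init,
                                     TrampUse::Adjust};
      std::printf("%d at %d\n", findSingleInitTrampoline(Users, Idx), Idx); // 1 at 1
    }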
if (InitTrampoline->getOperand(0) != TrampMem) - return 0; + return nullptr; return InitTrampoline; } @@ -850,9 +1096,9 @@ static IntrinsicInst *FindInitTrampolineFromBB(IntrinsicInst *AdjustTramp, II->getOperand(0) == TrampMem) return II; if (Inst->mayWriteToMemory()) - return 0; + return nullptr; } - return 0; + return nullptr; } // Given a call to llvm.adjust.trampoline, find and return the corresponding @@ -864,7 +1110,7 @@ static IntrinsicInst *FindInitTrampoline(Value *Callee) { IntrinsicInst *AdjustTramp = dyn_cast<IntrinsicInst>(Callee); if (!AdjustTramp || AdjustTramp->getIntrinsicID() != Intrinsic::adjust_trampoline) - return 0; + return nullptr; Value *TrampMem = AdjustTramp->getOperand(0); @@ -872,7 +1118,7 @@ static IntrinsicInst *FindInitTrampoline(Value *Callee) { return IT; if (IntrinsicInst *IT = FindInitTrampolineFromBB(AdjustTramp, TrampMem)) return IT; - return 0; + return nullptr; } // visitCallSite - Improvements for call and invoke instructions. @@ -887,7 +1133,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { // arguments of the call/invoke. Value *Callee = CS.getCalledValue(); if (!isa<Function>(Callee) && transformConstExprCastCall(CS)) - return 0; + return nullptr; if (Function *CalleeF = dyn_cast<Function>(Callee)) // If the call and callee calling conventions don't match, this call must @@ -912,7 +1158,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { // change the callee to a null pointer. cast<InvokeInst>(OldCall)->setCalledFunction( Constant::getNullValue(CalleeF->getType())); - return 0; + return nullptr; } if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) { @@ -924,7 +1170,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { if (isa<InvokeInst>(CS.getInstruction())) { // Can't remove an invoke because we cannot change the CFG. - return 0; + return nullptr; } // This instruction is not reachable, just remove it. We insert a store to @@ -949,7 +1195,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { for (CallSite::arg_iterator I = CS.arg_begin() + FTy->getNumParams(), E = CS.arg_end(); I != E; ++I, ++ix) { CastInst *CI = dyn_cast<CastInst>(*I); - if (CI && isSafeToEliminateVarargsCast(CS, CI, TD, ix)) { + if (CI && isSafeToEliminateVarargsCast(CS, CI, DL, ix)) { *I = CI->getOperand(0); Changed = true; } @@ -966,13 +1212,13 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { // this. None of these calls are seen as possibly dead so go ahead and // delete the instruction now. if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) { - Instruction *I = tryOptimizeCall(CI, TD); + Instruction *I = tryOptimizeCall(CI, DL); // If we changed something return the result, etc. Otherwise let // the fallthrough check. if (I) return EraseInstFromFunction(*I); } - return Changed ? CS.getInstruction() : 0; + return Changed ? 
CS.getInstruction() : nullptr; } // transformConstExprCastCall - If the callee is a constexpr cast of a function, @@ -981,7 +1227,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { bool InstCombiner::transformConstExprCastCall(CallSite CS) { Function *Callee = dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts()); - if (Callee == 0) + if (!Callee) return false; Instruction *Caller = CS.getInstruction(); const AttributeSet &CallerPAL = CS.getAttributes(); @@ -994,11 +1240,12 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { Type *OldRetTy = Caller->getType(); Type *NewRetTy = FT->getReturnType(); - if (NewRetTy->isStructTy()) - return false; // TODO: Handle multiple return values. - // Check to see if we are changing the return type... if (OldRetTy != NewRetTy) { + + if (NewRetTy->isStructTy()) + return false; // TODO: Handle multiple return values. + if (!CastInst::isBitCastable(NewRetTy, OldRetTy)) { if (Callee->isDeclaration()) return false; // Cannot transform this return value. @@ -1024,9 +1271,8 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { // the critical edge). Bail out in this case. if (!Caller->use_empty()) if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) - for (Value::use_iterator UI = II->use_begin(), E = II->use_end(); - UI != E; ++UI) - if (PHINode *PN = dyn_cast<PHINode>(*UI)) + for (User *U : II->users()) + if (PHINode *PN = dyn_cast<PHINode>(U)) if (PN->getParent() == II->getNormalDest() || PN->getParent() == II->getUnwindDest()) return false; @@ -1048,18 +1294,21 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { typeIncompatible(ParamTy, i + 1), i + 1)) return false; // Attribute not compatible with transformed value. + if (CS.isInAllocaArgument(i)) + return false; // Cannot transform to and from inalloca. + // If the parameter is passed as a byval argument, then we have to have a // sized type and the sized type has to have the same size as the old type. if (ParamTy != ActTy && CallerPAL.getParamAttributes(i + 1).hasAttribute(i + 1, Attribute::ByVal)) { PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy); - if (ParamPTy == 0 || !ParamPTy->getElementType()->isSized() || TD == 0) + if (!ParamPTy || !ParamPTy->getElementType()->isSized() || !DL) return false; Type *CurElTy = ActTy->getPointerElementType(); - if (TD->getTypeAllocSize(CurElTy) != - TD->getTypeAllocSize(ParamPTy->getElementType())) + if (DL->getTypeAllocSize(CurElTy) != + DL->getTypeAllocSize(ParamPTy->getElementType())) return false; } } @@ -1223,6 +1472,8 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { if (!Caller->use_empty()) ReplaceInstUsesWith(*Caller, NV); + else if (Caller->hasValueHandle()) + ValueHandleBase::ValueIsRAUWd(Caller, NV); EraseInstFromFunction(*Caller); return true; @@ -1243,7 +1494,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, // If the call already has the 'nest' attribute somewhere then give up - // otherwise 'nest' would occur twice after splicing in the chain. if (Attrs.hasAttrSomewhere(Attribute::Nest)) - return 0; + return nullptr; assert(Tramp && "transformCallThroughTrampoline called with incorrect CallSite."); @@ -1255,7 +1506,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, const AttributeSet &NestAttrs = NestF->getAttributes(); if (!NestAttrs.isEmpty()) { unsigned NestIdx = 1; - Type *NestTy = 0; + Type *NestTy = nullptr; AttributeSet NestAttr; // Look for a parameter marked with the 'nest' attribute. 
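Editorial aside, before the diff moves on to InstCombineCasts.cpp: once a vararg cast is known lossless, the byval/inalloca rule in isSafeToEliminateVarargsCast above reduces to a size comparison, because the ABI derives the copied size from the pointee type. A standalone C++ sketch of that decision (names illustrative; the sized-type and DataLayout-availability checks are elided, and the sizes stand in for DL->getTypeAllocSize()):

    #include <cstdio>

    bool safeToStripVarargCast(bool IsByValOrInAlloca,
                               unsigned SrcAllocSize, unsigned DstAllocSize) {
      if (!IsByValOrInAlloca)
        return true;                       // size is not derived from the type
      return SrcAllocSize == DstAllocSize; // byval/inalloca: sizes must match
    }

    int main() {
      std::printf("%d %d\n",
                  safeToStripVarargCast(true, 16, 16),   // 1
                  safeToStripVarargCast(true, 16, 24));  // 0
    }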
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 72377dc..b9c3d0f 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -14,11 +14,13 @@ #include "InstCombine.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/IR/DataLayout.h" -#include "llvm/Support/PatternMatch.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Target/TargetLibraryInfo.h" using namespace llvm; using namespace PatternMatch; +#define DEBUG_TYPE "instcombine" + /// DecomposeSimpleLinearExpr - Analyze 'Val', seeing if it is a simple linear /// expression. If so, decompose it, returning some value X, such that Val is /// X*Scale+Offset. @@ -79,7 +81,7 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI) { // This requires DataLayout to get the alloca alignment and size information. - if (!TD) return 0; + if (!DL) return nullptr; PointerType *PTy = cast<PointerType>(CI.getType()); @@ -89,26 +91,26 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, // Get the type really allocated and the type casted to. Type *AllocElTy = AI.getAllocatedType(); Type *CastElTy = PTy->getElementType(); - if (!AllocElTy->isSized() || !CastElTy->isSized()) return 0; + if (!AllocElTy->isSized() || !CastElTy->isSized()) return nullptr; - unsigned AllocElTyAlign = TD->getABITypeAlignment(AllocElTy); - unsigned CastElTyAlign = TD->getABITypeAlignment(CastElTy); - if (CastElTyAlign < AllocElTyAlign) return 0; + unsigned AllocElTyAlign = DL->getABITypeAlignment(AllocElTy); + unsigned CastElTyAlign = DL->getABITypeAlignment(CastElTy); + if (CastElTyAlign < AllocElTyAlign) return nullptr; // If the allocation has multiple uses, only promote it if we are strictly // increasing the alignment of the resultant allocation. If we keep it the // same, we open the door to infinite loops of various kinds. - if (!AI.hasOneUse() && CastElTyAlign == AllocElTyAlign) return 0; + if (!AI.hasOneUse() && CastElTyAlign == AllocElTyAlign) return nullptr; - uint64_t AllocElTySize = TD->getTypeAllocSize(AllocElTy); - uint64_t CastElTySize = TD->getTypeAllocSize(CastElTy); - if (CastElTySize == 0 || AllocElTySize == 0) return 0; + uint64_t AllocElTySize = DL->getTypeAllocSize(AllocElTy); + uint64_t CastElTySize = DL->getTypeAllocSize(CastElTy); + if (CastElTySize == 0 || AllocElTySize == 0) return nullptr; // If the allocation has multiple uses, only promote it if we're not // shrinking the amount of memory being allocated. - uint64_t AllocElTyStoreSize = TD->getTypeStoreSize(AllocElTy); - uint64_t CastElTyStoreSize = TD->getTypeStoreSize(CastElTy); - if (!AI.hasOneUse() && CastElTyStoreSize < AllocElTyStoreSize) return 0; + uint64_t AllocElTyStoreSize = DL->getTypeStoreSize(AllocElTy); + uint64_t CastElTyStoreSize = DL->getTypeStoreSize(CastElTy); + if (!AI.hasOneUse() && CastElTyStoreSize < AllocElTyStoreSize) return nullptr; // See if we can satisfy the modulus by pulling a scale out of the array // size argument. @@ -120,10 +122,10 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, // If we can now satisfy the modulus, by using a non-1 scale, we really can // do the xform. 
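Editorial aside: the modulus test that follows decides whether the alloca can be re-emitted with the cast-to element type. A standalone C++ sketch of the same arithmetic (promotedElementCount is an illustrative name): the allocation is rewritable when the old element size times the array scale divides evenly by the new element size, giving the per-element expansion factor.

    #include <cstdio>

    bool promotedElementCount(unsigned AllocElTySize, unsigned CastElTySize,
                              unsigned ArraySizeScale, unsigned ArrayOffset,
                              unsigned &Scale) {
      if ((AllocElTySize * ArraySizeScale) % CastElTySize != 0 ||
          (AllocElTySize * ArrayOffset) % CastElTySize != 0)
        return false;                              // cannot satisfy the modulus
      Scale = (AllocElTySize * ArraySizeScale) / CastElTySize;
      return true;
    }

    int main() {
      unsigned Scale;
      // e.g. an alloca of i64 elements viewed through an i16*: 8 % 2 == 0,
      // so each i64 becomes 4 i16 slots.
      if (promotedElementCount(8, 2, 1, 0, Scale))
        std::printf("Scale = %u\n", Scale);        // Scale = 4
    }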
if ((AllocElTySize*ArraySizeScale) % CastElTySize != 0 || - (AllocElTySize*ArrayOffset ) % CastElTySize != 0) return 0; + (AllocElTySize*ArrayOffset ) % CastElTySize != 0) return nullptr; unsigned Scale = (AllocElTySize*ArraySizeScale)/CastElTySize; - Value *Amt = 0; + Value *Amt = nullptr; if (Scale == 1) { Amt = NumElements; } else { @@ -141,6 +143,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, AllocaInst *New = AllocaBuilder.CreateAlloca(CastElTy, Amt); New->setAlignment(AI.getAlignment()); New->takeName(&AI); + New->setUsedWithInAlloca(AI.isUsedWithInAlloca()); // If the allocation has multiple real uses, insert a cast and change all // things that used it to use the new cast. This will also hack on CI, but it @@ -161,15 +164,15 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned) { if (Constant *C = dyn_cast<Constant>(V)) { C = ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/); - // If we got a constantexpr back, try to simplify it with TD info. + // If we got a constantexpr back, try to simplify it with DL info. if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) - C = ConstantFoldConstantExpression(CE, TD, TLI); + C = ConstantFoldConstantExpression(CE, DL, TLI); return C; } // Otherwise, it must be an instruction. Instruction *I = cast<Instruction>(V); - Instruction *Res = 0; + Instruction *Res = nullptr; unsigned Opc = I->getOpcode(); switch (Opc) { case Instruction::Add: @@ -235,7 +238,7 @@ isEliminableCastPair( const CastInst *CI, ///< The first cast instruction unsigned opcode, ///< The opcode of the second cast instruction Type *DstTy, ///< The target type for the second cast instruction - DataLayout *TD ///< The target data for pointer size + const DataLayout *DL ///< The target data for pointer size ) { Type *SrcTy = CI->getOperand(0)->getType(); // A from above @@ -244,12 +247,12 @@ isEliminableCastPair( // Get the opcodes of the two Cast instructions Instruction::CastOps firstOp = Instruction::CastOps(CI->getOpcode()); Instruction::CastOps secondOp = Instruction::CastOps(opcode); - Type *SrcIntPtrTy = TD && SrcTy->isPtrOrPtrVectorTy() ? - TD->getIntPtrType(SrcTy) : 0; - Type *MidIntPtrTy = TD && MidTy->isPtrOrPtrVectorTy() ? - TD->getIntPtrType(MidTy) : 0; - Type *DstIntPtrTy = TD && DstTy->isPtrOrPtrVectorTy() ? - TD->getIntPtrType(DstTy) : 0; + Type *SrcIntPtrTy = DL && SrcTy->isPtrOrPtrVectorTy() ? + DL->getIntPtrType(SrcTy) : nullptr; + Type *MidIntPtrTy = DL && MidTy->isPtrOrPtrVectorTy() ? + DL->getIntPtrType(MidTy) : nullptr; + Type *DstIntPtrTy = DL && DstTy->isPtrOrPtrVectorTy() ? + DL->getIntPtrType(DstTy) : nullptr; unsigned Res = CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy, DstTy, SrcIntPtrTy, MidIntPtrTy, DstIntPtrTy); @@ -275,7 +278,7 @@ bool InstCombiner::ShouldOptimizeCast(Instruction::CastOps opc, const Value *V, // If this is another cast that can be eliminated, we prefer to have it // eliminated. if (const CastInst *CI = dyn_cast<CastInst>(V)) - if (isEliminableCastPair(CI, opc, Ty, TD)) + if (isEliminableCastPair(CI, opc, Ty, DL)) return false; // If this is a vector sext from a compare, then we don't want to break the @@ -295,7 +298,7 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) { // eliminate it now. 
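Editorial aside: an A->B->C cast pair is eliminable when the composition is itself a single cast. A runnable C++ example of the classic integer case (the editor's illustration, not from the diff): zext i8 to i32 followed by trunc i32 to i16 equals the single cast zext i8 to i16 for every input.

    #include <cstdint>
    #include <cstdio>

    int main() {
      for (unsigned V = 0; V < 256; ++V) {
        uint8_t A = (uint8_t)V;
        uint16_t TwoCasts = (uint16_t)(uint32_t)A;   // A->B->C
        uint16_t OneCast = (uint16_t)A;              // A->C
        if (TwoCasts != OneCast) {
          std::printf("mismatch at %u\n", V);
          return 1;
        }
      }
      std::printf("zext+trunc == zext for all i8 inputs\n");
    }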
if (CastInst *CSrc = dyn_cast<CastInst>(Src)) { // A->B->C cast if (Instruction::CastOps opc = - isEliminableCastPair(CSrc, CI.getOpcode(), CI.getType(), TD)) { + isEliminableCastPair(CSrc, CI.getOpcode(), CI.getType(), DL)) { // The first cast (CSrc) is eliminable so we need to fix up or replace // the second cast (CI). CSrc will then have a good chance of being dead. return CastInst::Create(opc, CSrc->getOperand(0), CI.getType()); @@ -318,7 +321,7 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) { return NV; } - return 0; + return nullptr; } /// CanEvaluateTruncated - Return true if we can evaluate the specified @@ -470,7 +473,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { } // Transform trunc(lshr (zext A), Cst) to eliminate one type conversion. - Value *A = 0; ConstantInt *Cst = 0; + Value *A = nullptr; ConstantInt *Cst = nullptr; if (Src->hasOneUse() && match(Src, m_LShr(m_ZExt(m_Value(A)), m_ConstantInt(Cst)))) { // We have three types to worry about here, the type of A, the source of @@ -502,7 +505,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { ConstantExpr::getTrunc(Cst, CI.getType())); } - return 0; + return nullptr; } /// transformZExtICmp - Transform (zext icmp) to bitwise / integer operations @@ -550,7 +553,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, // If Op1C some other power of two, convert: uint32_t BitWidth = Op1C->getType()->getBitWidth(); APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - ComputeMaskedBits(ICI->getOperand(0), KnownZero, KnownOne); + computeKnownBits(ICI->getOperand(0), KnownZero, KnownOne); APInt KnownZeroMask(~KnownZero); if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1? @@ -598,8 +601,8 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, APInt KnownZeroLHS(BitWidth, 0), KnownOneLHS(BitWidth, 0); APInt KnownZeroRHS(BitWidth, 0), KnownOneRHS(BitWidth, 0); - ComputeMaskedBits(LHS, KnownZeroLHS, KnownOneLHS); - ComputeMaskedBits(RHS, KnownZeroRHS, KnownOneRHS); + computeKnownBits(LHS, KnownZeroLHS, KnownOneLHS); + computeKnownBits(RHS, KnownZeroRHS, KnownOneRHS); if (KnownZeroLHS == KnownZeroRHS && KnownOneLHS == KnownOneRHS) { APInt KnownBits = KnownZeroLHS | KnownOneLHS; @@ -627,7 +630,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, } } - return 0; + return nullptr; } /// CanEvaluateZExtd - Determine if the specified value can be computed in the @@ -757,8 +760,8 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { Instruction *InstCombiner::visitZExt(ZExtInst &CI) { // If this zero extend is only used by a truncate, let the truncate be // eliminated before we try to optimize this zext. - if (CI.hasOneUse() && isa<TruncInst>(CI.use_back())) - return 0; + if (CI.hasOneUse() && isa<TruncInst>(CI.user_back())) + return nullptr; // If one of the common conversion will work, do it. if (Instruction *Result = commonCastTransforms(CI)) @@ -858,42 +861,32 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { } } - // zext(trunc(t) & C) -> (t & zext(C)). - if (SrcI && SrcI->getOpcode() == Instruction::And && SrcI->hasOneUse()) - if (ConstantInt *C = dyn_cast<ConstantInt>(SrcI->getOperand(1))) - if (TruncInst *TI = dyn_cast<TruncInst>(SrcI->getOperand(0))) { - Value *TI0 = TI->getOperand(0); - if (TI0->getType() == CI.getType()) - return - BinaryOperator::CreateAnd(TI0, - ConstantExpr::getZExt(C, CI.getType())); - } - - // zext((trunc(t) & C) ^ C) -> ((t & zext(C)) ^ zext(C)). 
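Editorial aside: the identity behind both the removed form above and the pattern-match form that replaces it can be checked directly on host integers, since trunc discards exactly the bits that zext(C) masks off anyway. A small standalone C++ check (the editor's illustration):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t X = 0xDEADBEEFCAFEBABEull;
      uint32_t C = 0x00FF00FFu;
      uint64_t LHS = (uint64_t)((uint32_t)X & C);    // zext(trunc(X) & C)
      uint64_t RHS = X & (uint64_t)C;                // X & zext(C)
      std::printf("%d\n", LHS == RHS);               // 1
    }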
- if (SrcI && SrcI->getOpcode() == Instruction::Xor && SrcI->hasOneUse()) - if (ConstantInt *C = dyn_cast<ConstantInt>(SrcI->getOperand(1))) - if (BinaryOperator *And = dyn_cast<BinaryOperator>(SrcI->getOperand(0))) - if (And->getOpcode() == Instruction::And && And->hasOneUse() && - And->getOperand(1) == C) - if (TruncInst *TI = dyn_cast<TruncInst>(And->getOperand(0))) { - Value *TI0 = TI->getOperand(0); - if (TI0->getType() == CI.getType()) { - Constant *ZC = ConstantExpr::getZExt(C, CI.getType()); - Value *NewAnd = Builder->CreateAnd(TI0, ZC); - return BinaryOperator::CreateXor(NewAnd, ZC); - } - } + // zext(trunc(X) & C) -> (X & zext(C)). + Constant *C; + Value *X; + if (SrcI && + match(SrcI, m_OneUse(m_And(m_Trunc(m_Value(X)), m_Constant(C)))) && + X->getType() == CI.getType()) + return BinaryOperator::CreateAnd(X, ConstantExpr::getZExt(C, CI.getType())); + + // zext((trunc(X) & C) ^ C) -> ((X & zext(C)) ^ zext(C)). + Value *And; + if (SrcI && match(SrcI, m_OneUse(m_Xor(m_Value(And), m_Constant(C)))) && + match(And, m_OneUse(m_And(m_Trunc(m_Value(X)), m_Specific(C)))) && + X->getType() == CI.getType()) { + Constant *ZC = ConstantExpr::getZExt(C, CI.getType()); + return BinaryOperator::CreateXor(Builder->CreateAnd(X, ZC), ZC); + } // zext (xor i1 X, true) to i32 --> xor (zext i1 X to i32), 1 - Value *X; - if (SrcI && SrcI->hasOneUse() && SrcI->getType()->isIntegerTy(1) && - match(SrcI, m_Not(m_Value(X))) && - (!X->hasOneUse() || !isa<CmpInst>(X))) { + if (SrcI && SrcI->hasOneUse() && + SrcI->getType()->getScalarType()->isIntegerTy(1) && + match(SrcI, m_Not(m_Value(X))) && (!X->hasOneUse() || !isa<CmpInst>(X))) { Value *New = Builder->CreateZExt(X, CI.getType()); return BinaryOperator::CreateXor(New, ConstantInt::get(CI.getType(), 1)); } - return 0; + return nullptr; } /// transformSExtICmp - Transform (sext icmp) to bitwise / integer operations @@ -902,10 +895,10 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { Value *Op0 = ICI->getOperand(0), *Op1 = ICI->getOperand(1); ICmpInst::Predicate Pred = ICI->getPredicate(); - if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) { + if (Constant *Op1C = dyn_cast<Constant>(Op1)) { // (x <s 0) ? -1 : 0 -> ashr x, 31 -> all ones if negative // (x >s -1) ? -1 : 0 -> not (ashr x, 31) -> all ones if positive - if ((Pred == ICmpInst::ICMP_SLT && Op1C->isZero()) || + if ((Pred == ICmpInst::ICMP_SLT && Op1C->isNullValue()) || (Pred == ICmpInst::ICMP_SGT && Op1C->isAllOnesValue())) { Value *Sh = ConstantInt::get(Op0->getType(), @@ -918,7 +911,9 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { In = Builder->CreateNot(In, In->getName()+".not"); return ReplaceInstUsesWith(CI, In); } + } + if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) { // If we know that only one bit of the LHS of the icmp can be set and we // have an equality comparison with zero or a power of 2, we can transform // the icmp and sext into bitwise/integer operations. @@ -926,7 +921,7 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { ICI->isEquality() && (Op1C->isZero() || Op1C->getValue().isPowerOf2())){ unsigned BitWidth = Op1C->getType()->getBitWidth(); APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - ComputeMaskedBits(Op0, KnownZero, KnownOne); + computeKnownBits(Op0, KnownZero, KnownOne); APInt KnownZeroMask(~KnownZero); if (KnownZeroMask.isPowerOf2()) { @@ -975,20 +970,7 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { } } - // vector (x <s 0) ? 
-1 : 0 -> ashr x, 31 -> all ones if signed. - if (VectorType *VTy = dyn_cast<VectorType>(CI.getType())) { - if (Pred == ICmpInst::ICMP_SLT && match(Op1, m_Zero()) && - Op0->getType() == CI.getType()) { - Type *EltTy = VTy->getElementType(); - - // splat the shift constant to a constant vector. - Constant *VSh = ConstantInt::get(VTy, EltTy->getScalarSizeInBits()-1); - Value *In = Builder->CreateAShr(Op0, VSh, Op0->getName()+".lobit"); - return ReplaceInstUsesWith(CI, In); - } - } - - return 0; + return nullptr; } /// CanEvaluateSExtd - Return true if we can take the specified value @@ -1059,8 +1041,8 @@ static bool CanEvaluateSExtd(Value *V, Type *Ty) { Instruction *InstCombiner::visitSExt(SExtInst &CI) { // If this sign extend is only used by a truncate, let the truncate be // eliminated before we try to optimize this sext. - if (CI.hasOneUse() && isa<TruncInst>(CI.use_back())) - return 0; + if (CI.hasOneUse() && isa<TruncInst>(CI.user_back())) - return nullptr; if (Instruction *I = commonCastTransforms(CI)) return I; @@ -1128,9 +1110,9 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { // into: // %a = shl i32 %i, 30 // %d = ashr i32 %a, 30 - Value *A = 0; + Value *A = nullptr; // TODO: Eventually this could be subsumed by EvaluateInDifferentType. - ConstantInt *BA = 0, *CA = 0; + ConstantInt *BA = nullptr, *CA = nullptr; if (match(Src, m_AShr(m_Shl(m_Trunc(m_Value(A)), m_ConstantInt(BA)), m_ConstantInt(CA))) && BA == CA && A->getType() == CI.getType()) { @@ -1142,7 +1124,7 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { return BinaryOperator::CreateAShr(A, ShAmtV); } - return 0; + return nullptr; } @@ -1154,7 +1136,7 @@ static Constant *FitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) { (void)F.convert(Sem, APFloat::rmNearestTiesToEven, &losesInfo); if (!losesInfo) return ConstantFP::get(CFP->getContext(), F); - return 0; + return nullptr; } /// LookThroughFPExtensions - If this is an fp extension instruction, look @@ -1189,43 +1171,112 @@ static Value *LookThroughFPExtensions(Value *V) { Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { if (Instruction *I = commonCastTransforms(CI)) return I; - - // If we have fptrunc(fadd (fpextend x), (fpextend y)), where x and y are - // smaller than the destination type, we can eliminate the truncate by doing - // the add as the smaller type. This applies to fadd/fsub/fmul/fdiv as well - // as many builtins (sqrt, etc). + // If we have fptrunc(OpI (fpextend x), (fpextend y)), we would like to + // simplify this expression to avoid one or more of the trunc/extend + // operations if we can do so without changing the numerical results. + // + // The exact manner in which the widths of the operands interact to limit + // what we can and cannot do safely varies from operation to operation, and + // is explained below in the various case statements.
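Editorial aside: the case statements below reason in terms of IEEE mantissa widths (half 11, float 24, double 53, fp128 113, as returned by getFPMantissaWidth). A standalone C++ sketch of the FAdd/FSub criterion quoted from the comments, with two worked width combinations (canNarrowFAdd is an illustrative name):

    #include <cstdio>

    bool canNarrowFAdd(unsigned OpWidth, unsigned SrcWidth, unsigned DstWidth) {
      return OpWidth >= 2 * DstWidth + 1 && DstWidth >= SrcWidth;
    }

    int main() {
      // (float)((double)x + (double)y) with float x, y:
      // OpWidth=53, SrcWidth=24, DstWidth=24, and 53 >= 49: narrowable.
      std::printf("%d\n", canNarrowFAdd(53, 24, 24));   // 1
      // One operand genuinely double (SrcWidth = 53): cannot narrow to float.
      std::printf("%d\n", canNarrowFAdd(53, 53, 24));   // 0
    }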
BinaryOperator *OpI = dyn_cast<BinaryOperator>(CI.getOperand(0)); if (OpI && OpI->hasOneUse()) { + Value *LHSOrig = LookThroughFPExtensions(OpI->getOperand(0)); + Value *RHSOrig = LookThroughFPExtensions(OpI->getOperand(1)); + unsigned OpWidth = OpI->getType()->getFPMantissaWidth(); + unsigned LHSWidth = LHSOrig->getType()->getFPMantissaWidth(); + unsigned RHSWidth = RHSOrig->getType()->getFPMantissaWidth(); + unsigned SrcWidth = std::max(LHSWidth, RHSWidth); + unsigned DstWidth = CI.getType()->getFPMantissaWidth(); switch (OpI->getOpcode()) { - default: break; - case Instruction::FAdd: - case Instruction::FSub: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FRem: - Type *SrcTy = OpI->getType(); - Value *LHSTrunc = LookThroughFPExtensions(OpI->getOperand(0)); - Value *RHSTrunc = LookThroughFPExtensions(OpI->getOperand(1)); - if (LHSTrunc->getType() != SrcTy && - RHSTrunc->getType() != SrcTy) { - unsigned DstSize = CI.getType()->getScalarSizeInBits(); - // If the source types were both smaller than the destination type of - // the cast, do this xform. - if (LHSTrunc->getType()->getScalarSizeInBits() <= DstSize && - RHSTrunc->getType()->getScalarSizeInBits() <= DstSize) { - LHSTrunc = Builder->CreateFPExt(LHSTrunc, CI.getType()); - RHSTrunc = Builder->CreateFPExt(RHSTrunc, CI.getType()); - return BinaryOperator::Create(OpI->getOpcode(), LHSTrunc, RHSTrunc); + default: break; + case Instruction::FAdd: + case Instruction::FSub: + // For addition and subtraction, the infinitely precise result can + // essentially be arbitrarily wide; proving that double rounding + // will not occur because the result of OpI is exact (as we will for + // FMul, for example) is hopeless. However, we *can* nonetheless + // frequently know that double rounding cannot occur (or that it is + // innocuous) by taking advantage of the specific structure of + // infinitely-precise results that admit double rounding. + // + // Specifically, if OpWidth >= 2*DstWidth+1 and DstWidth is sufficient + // to represent both sources, we can guarantee that the double + // rounding is innocuous (See p50 of Figueroa's 2000 PhD thesis, + // "A Rigorous Framework for Fully Supporting the IEEE Standard ..." + // for proof of this fact). + // + // Note: Figueroa does not consider the case where DstFormat != + // SrcFormat. It's possible (likely even!) that this analysis + // could be tightened for those cases, but they are rare (the main + // case of interest here is (float)((double)float + float)). + if (OpWidth >= 2*DstWidth+1 && DstWidth >= SrcWidth) { + if (LHSOrig->getType() != CI.getType()) + LHSOrig = Builder->CreateFPExt(LHSOrig, CI.getType()); + if (RHSOrig->getType() != CI.getType()) + RHSOrig = Builder->CreateFPExt(RHSOrig, CI.getType()); + Instruction *RI = + BinaryOperator::Create(OpI->getOpcode(), LHSOrig, RHSOrig); + RI->copyFastMathFlags(OpI); + return RI; } - } - break; + break; + case Instruction::FMul: + // For multiplication, the infinitely precise result has at most + // LHSWidth + RHSWidth significant bits; if OpWidth is sufficient + // that such a value can be exactly represented, then no double + // rounding can possibly occur; we can safely perform the operation + // in the destination format if it can represent both sources.
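Editorial aside: the FMul bound is easy to exercise on the host. Two floats carry at most 24 significant bits each, and 24+24 is at most 53, so their product computed in double is exact. A runnable C++ demonstration using fma to expose any rounding in the double multiply (the editor's illustration):

    #include <cmath>
    #include <cstdio>

    int main() {
      float A = 16777215.0f;                   // 2^24 - 1: full float precision
      float B = 8388607.0f;                    // 2^23 - 1
      double P = (double)A * (double)B;        // needs <= 47 bits: exact in double
      double Err = std::fma((double)A, (double)B, -P);  // residual of the multiply
      std::printf("%.0f (error %g)\n", P, Err);         // error 0
    }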
+ if (OpWidth >= LHSWidth + RHSWidth && DstWidth >= SrcWidth) { + if (LHSOrig->getType() != CI.getType()) + LHSOrig = Builder->CreateFPExt(LHSOrig, CI.getType()); + if (RHSOrig->getType() != CI.getType()) + RHSOrig = Builder->CreateFPExt(RHSOrig, CI.getType()); + Instruction *RI = + BinaryOperator::CreateFMul(LHSOrig, RHSOrig); + RI->copyFastMathFlags(OpI); + return RI; + } + break; + case Instruction::FDiv: + // For division, we again use the bound from Figueroa's + // dissertation. I am entirely certain that this bound can be + // tightened in the unbalanced operand case by an analysis based on + // the diophantine rational approximation bound, but the well-known + // condition used here is a good conservative first pass. + // TODO: Tighten bound via rigorous analysis of the unbalanced case. + if (OpWidth >= 2*DstWidth && DstWidth >= SrcWidth) { + if (LHSOrig->getType() != CI.getType()) + LHSOrig = Builder->CreateFPExt(LHSOrig, CI.getType()); + if (RHSOrig->getType() != CI.getType()) + RHSOrig = Builder->CreateFPExt(RHSOrig, CI.getType()); + Instruction *RI = + BinaryOperator::CreateFDiv(LHSOrig, RHSOrig); + RI->copyFastMathFlags(OpI); + return RI; + } + break; + case Instruction::FRem: + // Remainder is straightforward. Remainder is always exact, so the + // type of OpI doesn't enter into things at all. We simply evaluate + // in whichever source type is larger, then convert to the + // destination type. + if (LHSWidth < SrcWidth) + LHSOrig = Builder->CreateFPExt(LHSOrig, RHSOrig->getType()); + else if (RHSWidth <= SrcWidth) + RHSOrig = Builder->CreateFPExt(RHSOrig, LHSOrig->getType()); + Value *ExactResult = Builder->CreateFRem(LHSOrig, RHSOrig); + if (Instruction *RI = dyn_cast<Instruction>(ExactResult)) + RI->copyFastMathFlags(OpI); + return CastInst::CreateFPCast(ExactResult, CI.getType()); } // (fptrunc (fneg x)) -> (fneg (fptrunc x)) if (BinaryOperator::isFNeg(OpI)) { Value *InnerTrunc = Builder->CreateFPTrunc(OpI->getOperand(1), CI.getType()); - return BinaryOperator::CreateFNeg(InnerTrunc); + Instruction *RI = BinaryOperator::CreateFNeg(InnerTrunc); + RI->copyFastMathFlags(OpI); + return RI; } } @@ -1297,7 +1348,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { } } - return 0; + return nullptr; } Instruction *InstCombiner::visitFPExt(CastInst &CI) { @@ -1306,7 +1357,7 @@ Instruction *InstCombiner::visitFPToUI(FPToUIInst &FI) { Instruction *OpI = dyn_cast<Instruction>(FI.getOperand(0)); - if (OpI == 0) + if (!OpI) return commonCastTransforms(FI); // fptoui(uitofp(X)) --> X @@ -1326,7 +1377,7 @@ Instruction *InstCombiner::visitFPToSI(FPToSIInst &FI) { Instruction *OpI = dyn_cast<Instruction>(FI.getOperand(0)); - if (OpI == 0) + if (!OpI) return commonCastTransforms(FI); // fptosi(sitofp(X)) --> X @@ -1357,11 +1408,11 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) { // trunc or zext to the intptr_t type, then inttoptr of it. This allows the // cast to be exposed to other transforms. - if (TD) { + if (DL) { unsigned AS = CI.getAddressSpace(); if (CI.getOperand(0)->getType()->getScalarSizeInBits() != - TD->getPointerSizeInBits(AS)) { - Type *Ty = TD->getIntPtrType(CI.getContext(), AS); + DL->getPointerSizeInBits(AS)) { + Type *Ty = DL->getIntPtrType(CI.getContext(), AS); if (CI.getType()->isVectorTy()) // Handle vectors of pointers.
Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements()); @@ -1373,7 +1424,7 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) { if (Instruction *I = commonCastTransforms(CI)) return I; - return 0; + return nullptr; } /// @brief Implement the transforms for cast of pointer (bitcast/ptrtoint) @@ -1383,7 +1434,12 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) { if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Src)) { // If casting the result of a getelementptr instruction with no offset, turn // this into a cast of the original pointer! - if (GEP->hasAllZeroIndices()) { + if (GEP->hasAllZeroIndices() && + // If CI is an addrspacecast and GEP changes the pointer type, merging + // GEP into CI would undo canonicalizing addrspacecast with different + // pointer types, causing infinite loops. + (!isa<AddrSpaceCastInst>(CI) || + GEP->getType() == GEP->getPointerOperand()->getType())) { // Changing the cast operand is usually not a good idea but it is safe // here because the pointer operand is being replaced with another // pointer operand so the opcode doesn't need to change. @@ -1392,7 +1448,7 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) { return &CI; } - if (!TD) + if (!DL) return commonCastTransforms(CI); // If the GEP has a single use, and the base pointer is a bitcast, and the // GEP computes a constant offset, see if we can convert these three // instructions into fewer. This typically happens with unions and other // non-type-safe code. unsigned AS = GEP->getPointerAddressSpace(); - unsigned OffsetBits = TD->getPointerSizeInBits(AS); + unsigned OffsetBits = DL->getPointerSizeInBits(AS); APInt Offset(OffsetBits, 0); BitCastInst *BCI = dyn_cast<BitCastInst>(GEP->getOperand(0)); if (GEP->hasOneUse() && BCI && - GEP->accumulateConstantOffset(*TD, Offset)) { + GEP->accumulateConstantOffset(*DL, Offset)) { // Get the base pointer input of the bitcast, and the type it points to. Value *OrigBase = BCI->getOperand(0); SmallVector<Value*, 8> NewIndices; @@ -1436,16 +1492,16 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) { // do a ptrtoint to intptr_t then do a trunc or zext. This allows the cast // to be exposed to other transforms. - if (!TD) + if (!DL) return commonPointerCastTransforms(CI); Type *Ty = CI.getType(); unsigned AS = CI.getPointerAddressSpace(); - if (Ty->getScalarSizeInBits() == TD->getPointerSizeInBits(AS)) + if (Ty->getScalarSizeInBits() == DL->getPointerSizeInBits(AS)) return commonPointerCastTransforms(CI); - Type *PtrTy = TD->getIntPtrType(CI.getContext(), AS); + Type *PtrTy = DL->getIntPtrType(CI.getContext(), AS); if (Ty->isVectorTy()) // Handle vectors of pointers. PtrTy = VectorType::get(PtrTy, Ty->getVectorNumElements()); @@ -1472,7 +1528,7 @@ static Instruction *OptimizeVectorResize(Value *InVal, VectorType *DestTy, // there yet. if (SrcTy->getElementType()->getPrimitiveSizeInBits() != DestTy->getElementType()->getPrimitiveSizeInBits()) - return 0; + return nullptr; SrcTy = VectorType::get(DestTy->getElementType(), SrcTy->getNumElements()); InVal = IC.Builder->CreateBitCast(InVal, SrcTy); @@ -1550,7 +1606,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, ElementIndex = Elements.size() - ElementIndex - 1; // Fail if multiple elements are inserted into this slot.
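Editorial aside: the slot arithmetic around the endianness flip above reduces to a small index computation. A standalone C++ sketch (insertionSlot is an illustrative name): a value inserted at bit offset Shift lands in integer element Shift/EltBits, and big-endian targets reverse the slot numbering.

    #include <cstdio>

    unsigned insertionSlot(unsigned Shift, unsigned EltBits,
                           unsigned NumElts, bool LittleEndian) {
      unsigned Index = Shift / EltBits;        // bit offset -> element number
      return LittleEndian ? Index : NumElts - Index - 1;   // BE flips the order
    }

    int main() {
      // An i32 inserted at bit 32 of an i128 viewed as <4 x i32>:
      std::printf("%u %u\n",
                  insertionSlot(32, 32, 4, true),    // 1 on little-endian
                  insertionSlot(32, 32, 4, false));  // 2 on big-endian
    }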
- if (Elements[ElementIndex] != 0) + if (Elements[ElementIndex]) return false; Elements[ElementIndex] = V; @@ -1590,7 +1646,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, if (!V->hasOneUse()) return false; Instruction *I = dyn_cast<Instruction>(V); - if (I == 0) return false; + if (!I) return false; switch (I->getOpcode()) { default: return false; // Unhandled case. case Instruction::BitCast: @@ -1611,7 +1667,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, case Instruction::Shl: { // Must be shifting by a constant that is a multiple of the element size. ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1)); - if (CI == 0) return false; + if (!CI) return false; Shift += CI->getZExtValue(); if (!isMultipleOfTypeSize(Shift, VecEltTy)) return false; return CollectInsertionElements(I->getOperand(0), Shift, @@ -1639,7 +1695,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI, InstCombiner &IC) { // We need to know the target byte order to perform this optimization. - if (!IC.getDataLayout()) return 0; + if (!IC.getDataLayout()) return nullptr; VectorType *DestVecTy = cast<VectorType>(CI.getType()); Value *IntInput = CI.getOperand(0); @@ -1647,14 +1703,14 @@ static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI, SmallVector<Value*, 8> Elements(DestVecTy->getNumElements()); if (!CollectInsertionElements(IntInput, 0, Elements, DestVecTy->getElementType(), IC)) - return 0; + return nullptr; // If we succeeded, we know that all of the elements are specified by Elements // or are zero if Elements has a null entry. Recast this as a set of // insertions. Value *Result = Constant::getNullValue(CI.getType()); for (unsigned i = 0, e = Elements.size(); i != e; ++i) { - if (Elements[i] == 0) continue; // Unset element. + if (!Elements[i]) continue; // Unset element. Result = IC.Builder->CreateInsertElement(Result, Elements[i], IC.Builder->getInt32(i)); @@ -1668,14 +1724,14 @@ static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI, /// bitcast. The various long double bitcasts can't get in here. static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){ // We need to know the target byte order to perform this optimization. - if (!IC.getDataLayout()) return 0; + if (!IC.getDataLayout()) return nullptr; Value *Src = CI.getOperand(0); Type *DestTy = CI.getType(); // If this is a bitcast from int to float, check to see if the int is an // extraction from a vector.
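Editorial aside: on a little-endian host, truncating a vector's integer image takes element 0, which is why the code below can rewrite bitcast(trunc(bitcast(v))) as an extractelement. A runnable C++ analogue using memcpy in place of the IR bitcasts (the editor's illustration; the printed result assumes a little-endian host):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      float V[2] = {3.5f, -1.0f};
      uint64_t AsInt;
      std::memcpy(&AsInt, V, 8);               // bitcast <2 x float> to i64
      uint32_t Low = (uint32_t)AsInt;          // trunc i64 to i32
      float F;
      std::memcpy(&F, &Low, 4);                // bitcast i32 to float
      std::printf("%g\n", F);                  // 3.5, i.e. element 0, on LE
    }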
- Value *VecInput = 0; + Value *VecInput = nullptr; // bitcast(trunc(bitcast(somevector))) if (match(Src, m_Trunc(m_BitCast(m_Value(VecInput)))) && isa<VectorType>(VecInput->getType())) { @@ -1699,7 +1755,7 @@ static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){ } // bitcast(trunc(lshr(bitcast(somevector), cst)) - ConstantInt *ShAmt = 0; + ConstantInt *ShAmt = nullptr; if (match(Src, m_Trunc(m_LShr(m_BitCast(m_Value(VecInput)), m_ConstantInt(ShAmt)))) && isa<VectorType>(VecInput->getType())) { @@ -1721,7 +1777,7 @@ static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){ return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt)); } } - return 0; + return nullptr; } Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { @@ -1741,11 +1797,6 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { Type *DstElTy = DstPTy->getElementType(); Type *SrcElTy = SrcPTy->getElementType(); - // If the address spaces don't match, don't eliminate the bitcast, which is - // required for changing types. - if (SrcPTy->getAddressSpace() != DstPTy->getAddressSpace()) - return 0; - // If we are casting an alloca to a pointer to a type of the same // size, rewrite the allocation instruction to allocate the "right" type. // There is no need to modify malloc calls because it is their bitcast that @@ -1858,5 +1909,24 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { } Instruction *InstCombiner::visitAddrSpaceCast(AddrSpaceCastInst &CI) { - return commonCastTransforms(CI); + // If the destination pointer element type is not the same as the source's, + // first do a bitcast to the destination type, and then the addrspacecast. + // This allows the cast to be exposed to other transforms. + Value *Src = CI.getOperand(0); + PointerType *SrcTy = cast<PointerType>(Src->getType()->getScalarType()); + PointerType *DestTy = cast<PointerType>(CI.getType()->getScalarType()); + + Type *DestElemTy = DestTy->getElementType(); + if (SrcTy->getElementType() != DestElemTy) { + Type *MidTy = PointerType::get(DestElemTy, SrcTy->getAddressSpace()); + if (VectorType *VT = dyn_cast<VectorType>(CI.getType())) { + // Handle vectors of pointers.
+ MidTy = VectorType::get(MidTy, VT->getNumElements()); + } + + Value *NewBitCast = Builder->CreateBitCast(Src, MidTy); + return new AddrSpaceCastInst(NewBitCast, CI.getType()); + } + + return commonPointerCastTransforms(CI); } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 9bb65ef..5e71c5c 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -15,28 +15,21 @@ #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/Support/ConstantRange.h" -#include "llvm/Support/GetElementPtrTypeIterator.h" -#include "llvm/Support/PatternMatch.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Target/TargetLibraryInfo.h" using namespace llvm; using namespace PatternMatch; +#define DEBUG_TYPE "instcombine" + static ConstantInt *getOne(Constant *C) { return ConstantInt::get(cast<IntegerType>(C->getType()), 1); } -/// AddOne - Add one to a ConstantInt -static Constant *AddOne(Constant *C) { - return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1)); -} -/// SubOne - Subtract one from a ConstantInt -static Constant *SubOne(Constant *C) { - return ConstantExpr::getSub(C, ConstantInt::get(C->getType(), 1)); -} - static ConstantInt *ExtractElement(Constant *V, Constant *Idx) { return cast<ConstantInt>(ConstantExpr::getExtractElement(V, Idx)); } @@ -227,15 +220,15 @@ Instruction *InstCombiner:: FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, CmpInst &ICI, ConstantInt *AndCst) { // We need TD information to know the pointer size unless this is inbounds. - if (!GEP->isInBounds() && TD == 0) - return 0; + if (!GEP->isInBounds() && !DL) + return nullptr; Constant *Init = GV->getInitializer(); if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init)) - return 0; + return nullptr; uint64_t ArrayElementCount = Init->getType()->getArrayNumElements(); - if (ArrayElementCount > 1024) return 0; // Don't blow up on huge arrays. + if (ArrayElementCount > 1024) return nullptr; // Don't blow up on huge arrays. // There are many forms of this optimization we can handle, for now, just do // the simple index into a single-dimensional array. @@ -245,7 +238,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, !isa<ConstantInt>(GEP->getOperand(1)) || !cast<ConstantInt>(GEP->getOperand(1))->isZero() || isa<Constant>(GEP->getOperand(2))) - return 0; + return nullptr; // Check that indices after the variable are constants and in-range for the // type they index. Collect the indices. This is typically for arrays of @@ -255,18 +248,18 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, Type *EltTy = Init->getType()->getArrayElementType(); for (unsigned i = 3, e = GEP->getNumOperands(); i != e; ++i) { ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(i)); - if (Idx == 0) return 0; // Variable index. + if (!Idx) return nullptr; // Variable index. uint64_t IdxVal = Idx->getZExtValue(); - if ((unsigned)IdxVal != IdxVal) return 0; // Too large array index. + if ((unsigned)IdxVal != IdxVal) return nullptr; // Too large array index. 
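// For example (hypothetical IR): for a compare against // load (gep [4 x { i32, i32 }]* @table, i32 0, i32 %i, i32 1) // this loop collects LaterIndices = {1}, so the fold below tests field 1 of // each array element against the comparison RHS.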
if (StructType *STy = dyn_cast<StructType>(EltTy)) EltTy = STy->getElementType(IdxVal); else if (ArrayType *ATy = dyn_cast<ArrayType>(EltTy)) { - if (IdxVal >= ATy->getNumElements()) return 0; + if (IdxVal >= ATy->getNumElements()) return nullptr; EltTy = ATy->getElementType(); } else { - return 0; // Unknown type. + return nullptr; // Unknown type. } LaterIndices.push_back(IdxVal); @@ -305,7 +298,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, Constant *CompareRHS = cast<Constant>(ICI.getOperand(1)); for (unsigned i = 0, e = ArrayElementCount; i != e; ++i) { Constant *Elt = Init->getAggregateElement(i); - if (Elt == 0) return 0; + if (!Elt) return nullptr; // If this is indexing an array of structures, get the structure element. if (!LaterIndices.empty()) @@ -316,7 +309,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, // Find out if the comparison would be true or false for the i'th element. Constant *C = ConstantFoldCompareInstOperands(ICI.getPredicate(), Elt, - CompareRHS, TD, TLI); + CompareRHS, DL, TLI); // If the result is undef for this element, ignore it. if (isa<UndefValue>(C)) { // Extend range state machines to cover this element in case there is an @@ -330,7 +323,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, // If we can't compute the result for any of the elements, we have to give // up evaluating the entire conditional. - if (!isa<ConstantInt>(C)) return 0; + if (!isa<ConstantInt>(C)) return nullptr; // Otherwise, we know if the comparison is true or false for this element, // update our state machines. @@ -384,7 +377,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, if ((i & 8) == 0 && i >= 64 && SecondTrueElement == Overdefined && SecondFalseElement == Overdefined && TrueRangeEnd == Overdefined && FalseRangeEnd == Overdefined) - return 0; + return nullptr; } // Now that we've scanned the entire array, emit our new comparison(s). We @@ -395,7 +388,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, // index down like the GEP would do implicitly. We don't have to do this for // an inbounds GEP because the index can't be out of range. 
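// For example (assuming 64-bit pointers), an i128 index is truncated to i64 // here, since a non-inbounds GEP would likewise use only the low 64 bits of // the index.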
if (!GEP->isInBounds()) { - Type *IntPtrTy = TD->getIntPtrType(GEP->getType()); + Type *IntPtrTy = DL->getIntPtrType(GEP->getType()); unsigned PtrSize = IntPtrTy->getIntegerBitWidth(); if (Idx->getType()->getPrimitiveSizeInBits() > PtrSize) Idx = Builder->CreateTrunc(Idx, IntPtrTy); @@ -476,7 +469,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, // of this load, replace it with computation that does: // ((magic_cst >> i) & 1) != 0 { - Type *Ty = 0; + Type *Ty = nullptr; // Look for an appropriate type: // - The type of Idx if the magic fits @@ -484,12 +477,12 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, // - Default to i32 if (ArrayElementCount <= Idx->getType()->getIntegerBitWidth()) Ty = Idx->getType(); - else if (TD) - Ty = TD->getSmallestLegalIntType(Init->getContext(), ArrayElementCount); + else if (DL) + Ty = DL->getSmallestLegalIntType(Init->getContext(), ArrayElementCount); else if (ArrayElementCount <= 32) Ty = Type::getInt32Ty(Init->getContext()); - if (Ty != 0) { + if (Ty) { Value *V = Builder->CreateIntCast(Idx, Ty, false); V = Builder->CreateLShr(ConstantInt::get(Ty, MagicBitvector), V); V = Builder->CreateAnd(ConstantInt::get(Ty, 1), V); @@ -497,7 +490,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, } } - return 0; + return nullptr; } @@ -512,7 +505,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, /// If we can't emit an optimized form for this expression, this returns null. /// static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) { - DataLayout &TD = *IC.getDataLayout(); + const DataLayout &DL = *IC.getDataLayout(); gep_type_iterator GTI = gep_type_begin(GEP); // Check to see if this gep only has a single variable index. If so, and if @@ -529,9 +522,9 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) { // Handle a struct index, which adds its field offset to the pointer. if (StructType *STy = dyn_cast<StructType>(*GTI)) { - Offset += TD.getStructLayout(STy)->getElementOffset(CI->getZExtValue()); + Offset += DL.getStructLayout(STy)->getElementOffset(CI->getZExtValue()); } else { - uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType()); + uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType()); Offset += Size*CI->getSExtValue(); } } else { @@ -542,26 +535,26 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) { // If there are no variable indices, we must have a constant offset, just // evaluate it the general way. - if (i == e) return 0; + if (i == e) return nullptr; Value *VariableIdx = GEP->getOperand(i); // Determine the scale factor of the variable element. For example, this is // 4 if the variable index is into an array of i32. - uint64_t VariableScale = TD.getTypeAllocSize(GTI.getIndexedType()); + uint64_t VariableScale = DL.getTypeAllocSize(GTI.getIndexedType()); // Verify that there are no other variable indices. If so, emit the hard way. for (++i, ++GTI; i != e; ++i, ++GTI) { ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i)); - if (!CI) return 0; + if (!CI) return nullptr; // Compute the aggregate offset of constant indices. if (CI->isZero()) continue; // Handle a struct index, which adds its field offset to the pointer. 
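// For example, under a typical data layout, index 1 into { i8, i32 } adds // offset 4: the i32 field starts at byte 4 once padding is accounted for.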
if (StructType *STy = dyn_cast<StructType>(*GTI)) { - Offset += TD.getStructLayout(STy)->getElementOffset(CI->getZExtValue()); + Offset += DL.getStructLayout(STy)->getElementOffset(CI->getZExtValue()); } else { - uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType()); + uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType()); Offset += Size*CI->getSExtValue(); } } @@ -571,7 +564,7 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) { // Okay, we know we have a single variable index, which must be a // pointer/array/vector index. If there is no offset, life is simple, return // the index. - Type *IntPtrTy = TD.getIntPtrType(GEP->getOperand(0)->getType()); + Type *IntPtrTy = DL.getIntPtrType(GEP->getOperand(0)->getType()); unsigned IntPtrWidth = IntPtrTy->getIntegerBitWidth(); if (Offset == 0) { // Cast to intptrty in case a truncation occurs. If an extension is needed, @@ -596,7 +589,7 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) { // multiple of the variable scale. int64_t NewOffs = Offset / (int64_t)VariableScale; if (Offset != NewOffs*(int64_t)VariableScale) - return 0; + return nullptr; // Okay, we can do this evaluation. Start by converting the index to intptr. if (VariableIdx->getType() != IntPtrTy) @@ -617,14 +610,15 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, // e.g. "&foo[0] <s &foo[1]" can't be folded to "true" because "foo" could be // the maximum signed value for the pointer type. if (ICmpInst::isSigned(Cond)) - return 0; + return nullptr; - // Look through bitcasts. - if (BitCastInst *BCI = dyn_cast<BitCastInst>(RHS)) - RHS = BCI->getOperand(0); + // Look through bitcasts and addrspacecasts. We do not, however, want to + // remove 0 GEPs. + if (!isa<GetElementPtrInst>(RHS)) + RHS = RHS->stripPointerCasts(); Value *PtrBase = GEPLHS->getOperand(0); - if (TD && PtrBase == RHS && GEPLHS->isInBounds()) { + if (DL && PtrBase == RHS && GEPLHS->isInBounds()) { // ((gep Ptr, OFFSET) cmp Ptr) ---> (OFFSET cmp 0). // This transformation (ignoring the base and scales) is valid because we // know pointers can't overflow since the gep is inbounds. See if we can @@ -632,7 +626,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, Value *Offset = EvaluateGEPOffsetExpression(GEPLHS, *this); // If not, synthesize the offset the hard way. - if (Offset == 0) + if (!Offset) Offset = EmitGEPOffset(GEPLHS); return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Offset, Constant::getNullValue(Offset->getType())); @@ -657,43 +651,44 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, // If we're comparing GEPs with two base pointers that only differ in type // and both GEPs have only constant indices or just one use, then fold // the compare with the adjusted indices. - if (TD && GEPLHS->isInBounds() && GEPRHS->isInBounds() && + if (DL && GEPLHS->isInBounds() && GEPRHS->isInBounds() && (GEPLHS->hasAllConstantIndices() || GEPLHS->hasOneUse()) && (GEPRHS->hasAllConstantIndices() || GEPRHS->hasOneUse()) && PtrBase->stripPointerCasts() == GEPRHS->getOperand(0)->stripPointerCasts()) { + Value *LOffset = EmitGEPOffset(GEPLHS); + Value *ROffset = EmitGEPOffset(GEPRHS); + + // If we looked through an addrspacecast between different sized address + // spaces, the LHS and RHS offsets are integers of different widths. + // Truncate to the smaller one.
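+ // For example (a hypothetical layout with 64-bit pointers in addrspace 0 + // and 32-bit pointers in addrspace 1): if the LHS GEP indexes an i8* and + // the RHS GEP indexes the same base addrspacecast'ed to i8 addrspace(1)*, + // EmitGEPOffset yields an i64 and an i32; the i64 side is truncated to + // i32 before the signed compare below.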
+ Type *LHSIndexTy = LOffset->getType(); + Type *RHSIndexTy = ROffset->getType(); + if (LHSIndexTy != RHSIndexTy) { + if (LHSIndexTy->getPrimitiveSizeInBits() < + RHSIndexTy->getPrimitiveSizeInBits()) { + ROffset = Builder->CreateTrunc(ROffset, LHSIndexTy); + } else + LOffset = Builder->CreateTrunc(LOffset, RHSIndexTy); + } + Value *Cmp = Builder->CreateICmp(ICmpInst::getSignedPredicate(Cond), - EmitGEPOffset(GEPLHS), - EmitGEPOffset(GEPRHS)); + LOffset, ROffset); return ReplaceInstUsesWith(I, Cmp); } // Otherwise, the base pointers are different and the indices are // different, bail out. - return 0; + return nullptr; } // If one of the GEPs has all zero indices, recurse. - bool AllZeros = true; - for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i) - if (!isa<Constant>(GEPLHS->getOperand(i)) || - !cast<Constant>(GEPLHS->getOperand(i))->isNullValue()) { - AllZeros = false; - break; - } - if (AllZeros) + if (GEPLHS->hasAllZeroIndices()) return FoldGEPICmp(GEPRHS, GEPLHS->getOperand(0), ICmpInst::getSwappedPredicate(Cond), I); // If the other GEP has all zero indices, recurse. - AllZeros = true; - for (unsigned i = 1, e = GEPRHS->getNumOperands(); i != e; ++i) - if (!isa<Constant>(GEPRHS->getOperand(i)) || - !cast<Constant>(GEPRHS->getOperand(i))->isNullValue()) { - AllZeros = false; - break; - } - if (AllZeros) + if (GEPRHS->hasAllZeroIndices()) return FoldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I); bool GEPsInBounds = GEPLHS->isInBounds() && GEPRHS->isInBounds(); @@ -728,7 +723,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, // Only lower this if the icmp is the only user of the GEP or if we expect // the result to fold to a constant! - if (TD && + if (DL && GEPsInBounds && (isa<ConstantExpr>(GEPLHS) || GEPLHS->hasOneUse()) && (isa<ConstantExpr>(GEPRHS) || GEPRHS->hasOneUse())) { @@ -738,7 +733,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, return new ICmpInst(ICmpInst::getSignedPredicate(Cond), L, R); } } - return 0; + return nullptr; } /// FoldICmpAddOpCst - Fold "icmp pred (X+CI), X". @@ -821,11 +816,11 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, // if it finds it. bool DivIsSigned = DivI->getOpcode() == Instruction::SDiv; if (!ICI.isEquality() && DivIsSigned != ICI.isSigned()) - return 0; + return nullptr; if (DivRHS->isZero()) - return 0; // The ProdOV computation fails on divide by zero. + return nullptr; // The ProdOV computation fails on divide by zero. if (DivIsSigned && DivRHS->isAllOnesValue()) - return 0; // The overflow computation also screws up here + return nullptr; // The overflow computation also screws up here if (DivRHS->isOne()) { // This eliminates some funny cases with INT_MIN. ICI.setOperand(0, DivI->getOperand(0)); // X/1 == X. @@ -859,7 +854,7 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, // overflow variable is set to 0 if its corresponding bound variable is valid, // -1 if overflowed off the bottom end, or +1 if overflowed off the top end. int LoOverflow = 0, HiOverflow = 0; - Constant *LoBound = 0, *HiBound = 0; + Constant *LoBound = nullptr, *HiBound = nullptr; if (!DivIsSigned) { // udiv // e.g. X/5 op 3 --> [15, 20) @@ -899,7 +894,7 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, HiBound = cast<ConstantInt>(ConstantExpr::getNeg(RangeSize)); if (HiBound == DivRHS) { // -INTMIN = INTMIN HiOverflow = 1; // [INTMIN+1, overflow) - HiBound = 0; // e.g.
X/INTMIN = 0 --> X > INTMIN + HiBound = nullptr; // e.g. X/INTMIN = 0 --> X > INTMIN } } else if (CmpRHSV.isStrictlyPositive()) { // (X / neg) op pos // e.g. X/-5 op 3 --> [-19, -14) @@ -973,20 +968,20 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr, uint32_t TypeBits = CmpRHSV.getBitWidth(); uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits); if (ShAmtVal >= TypeBits || ShAmtVal == 0) - return 0; + return nullptr; if (!ICI.isEquality()) { // If we have an unsigned comparison and an ashr, we can't simplify this. // Similarly for signed comparisons with lshr. if (ICI.isSigned() != (Shr->getOpcode() == Instruction::AShr)) - return 0; + return nullptr; // Otherwise, all lshr and most exact ashr's are equivalent to a udiv/sdiv // by a power of 2. Since we already have logic to simplify these, // transform to div and then simplify the resultant comparison. if (Shr->getOpcode() == Instruction::AShr && (!Shr->isExact() || ShAmtVal == TypeBits - 1)) - return 0; + return nullptr; // Revisit the shift (to delete it). Worklist.Add(Shr); @@ -1003,7 +998,7 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr, // If the builder folded the binop, just return it. BinaryOperator *TheDiv = dyn_cast<BinaryOperator>(Tmp); - if (TheDiv == 0) + if (!TheDiv) return &ICI; // Otherwise, fold this div/compare. @@ -1046,7 +1041,7 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr, Mask, Shr->getName()+".mask"); return new ICmpInst(ICI.getPredicate(), And, ShiftedCmpRHS); } - return 0; + return nullptr; } @@ -1065,7 +1060,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, unsigned DstBits = LHSI->getType()->getPrimitiveSizeInBits(), SrcBits = LHSI->getOperand(0)->getType()->getPrimitiveSizeInBits(); APInt KnownZero(SrcBits, 0), KnownOne(SrcBits, 0); - ComputeMaskedBits(LHSI->getOperand(0), KnownZero, KnownOne); + computeKnownBits(LHSI->getOperand(0), KnownZero, KnownOne); // If all the high bits are known, we can do this xform. if ((KnownZero|KnownOne).countLeadingOnes() >= SrcBits-DstBits) { @@ -1078,17 +1073,17 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, } break; - case Instruction::Xor: // (icmp pred (xor X, XorCST), CI) - if (ConstantInt *XorCST = dyn_cast<ConstantInt>(LHSI->getOperand(1))) { + case Instruction::Xor: // (icmp pred (xor X, XorCst), CI) + if (ConstantInt *XorCst = dyn_cast<ConstantInt>(LHSI->getOperand(1))) { // If this is a comparison that tests the signbit (X < 0) or (x > -1), // fold the xor. if ((ICI.getPredicate() == ICmpInst::ICMP_SLT && RHSV == 0) || (ICI.getPredicate() == ICmpInst::ICMP_SGT && RHSV.isAllOnesValue())) { Value *CompareVal = LHSI->getOperand(0); - // If the sign bit of the XorCST is not set, there is no change to + // If the sign bit of the XorCst is not set, there is no change to // the operation, just stop using the Xor. - if (!XorCST->isNegative()) { + if (!XorCst->isNegative()) { ICI.setOperand(0, CompareVal); Worklist.Add(LHSI); return &ICI; @@ -1110,8 +1105,8 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, if (LHSI->hasOneUse()) { // (icmp u/s (xor A SignBit), C) -> (icmp s/u A, (xor C SignBit)) - if (!ICI.isEquality() && XorCST->getValue().isSignBit()) { - const APInt &SignBit = XorCST->getValue(); + if (!ICI.isEquality() && XorCst->getValue().isSignBit()) { + const APInt &SignBit = XorCst->getValue(); ICmpInst::Predicate Pred = ICI.isSigned() ? 
ICI.getUnsignedPredicate() : ICI.getSignedPredicate(); @@ -1120,8 +1115,8 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, } // (icmp u/s (xor A ~SignBit), C) -> (icmp s/u (xor C ~SignBit), A) - if (!ICI.isEquality() && XorCST->isMaxValue(true)) { - const APInt &NotSignBit = XorCST->getValue(); + if (!ICI.isEquality() && XorCst->isMaxValue(true)) { + const APInt &NotSignBit = XorCst->getValue(); ICmpInst::Predicate Pred = ICI.isSigned() ? ICI.getUnsignedPredicate() : ICI.getSignedPredicate(); @@ -1134,20 +1129,20 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, // (icmp ugt (xor X, C), ~C) -> (icmp ult X, C) // iff -C is a power of 2 if (ICI.getPredicate() == ICmpInst::ICMP_UGT && - XorCST->getValue() == ~RHSV && (RHSV + 1).isPowerOf2()) - return new ICmpInst(ICmpInst::ICMP_ULT, LHSI->getOperand(0), XorCST); + XorCst->getValue() == ~RHSV && (RHSV + 1).isPowerOf2()) + return new ICmpInst(ICmpInst::ICMP_ULT, LHSI->getOperand(0), XorCst); // (icmp ult (xor X, C), -C) -> (icmp uge X, C) // iff -C is a power of 2 if (ICI.getPredicate() == ICmpInst::ICMP_ULT && - XorCST->getValue() == -RHSV && RHSV.isPowerOf2()) - return new ICmpInst(ICmpInst::ICMP_UGE, LHSI->getOperand(0), XorCST); + XorCst->getValue() == -RHSV && RHSV.isPowerOf2()) + return new ICmpInst(ICmpInst::ICMP_UGE, LHSI->getOperand(0), XorCst); } break; - case Instruction::And: // (icmp pred (and X, AndCST), RHS) + case Instruction::And: // (icmp pred (and X, AndCst), RHS) if (LHSI->hasOneUse() && isa<ConstantInt>(LHSI->getOperand(1)) && LHSI->getOperand(0)->hasOneUse()) { - ConstantInt *AndCST = cast<ConstantInt>(LHSI->getOperand(1)); + ConstantInt *AndCst = cast<ConstantInt>(LHSI->getOperand(1)); // If the LHS is an AND of a truncating cast, we can widen the // and/compare to be the input width without changing the value @@ -1158,10 +1153,10 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, // Extending a relational comparison when we're checking the sign // bit would not work. if (ICI.isEquality() || - (!AndCST->isNegative() && RHSV.isNonNegative())) { + (!AndCst->isNegative() && RHSV.isNonNegative())) { Value *NewAnd = Builder->CreateAnd(Cast->getOperand(0), - ConstantExpr::getZExt(AndCST, Cast->getSrcTy())); + ConstantExpr::getZExt(AndCst, Cast->getSrcTy())); NewAnd->takeName(LHSI); return new ICmpInst(ICI.getPredicate(), NewAnd, ConstantExpr::getZExt(RHS, Cast->getSrcTy())); @@ -1177,7 +1172,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, if (ICI.isEquality() && RHSV.getActiveBits() <= Ty->getBitWidth()) { Value *NewAnd = Builder->CreateAnd(Cast->getOperand(0), - ConstantExpr::getTrunc(AndCST, Ty)); + ConstantExpr::getTrunc(AndCst, Ty)); NewAnd->takeName(LHSI); return new ICmpInst(ICI.getPredicate(), NewAnd, ConstantExpr::getTrunc(RHS, Ty)); @@ -1190,49 +1185,58 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, // access. BinaryOperator *Shift = dyn_cast<BinaryOperator>(LHSI->getOperand(0)); if (Shift && !Shift->isShift()) - Shift = 0; + Shift = nullptr; ConstantInt *ShAmt; - ShAmt = Shift ? dyn_cast<ConstantInt>(Shift->getOperand(1)) : 0; - Type *Ty = Shift ? Shift->getType() : 0; // Type of the shift. - Type *AndTy = AndCST->getType(); // Type of the and. - - // We can fold this as long as we can't shift unknown bits - // into the mask. This can happen with signed shift - // rights, as they sign-extend. 
With logical shifts, - // we must still make sure the comparison is not signed - // because we are effectively changing the - // position of the sign bit (PR17827). - // TODO: We can relax these constraints a bit more. + ShAmt = Shift ? dyn_cast<ConstantInt>(Shift->getOperand(1)) : nullptr; + + // This seemingly simple opportunity to fold away a shift turns out to + // be rather complicated. See PR17827 + // ( http://llvm.org/bugs/show_bug.cgi?id=17827 ) for details. if (ShAmt) { bool CanFold = false; unsigned ShiftOpcode = Shift->getOpcode(); if (ShiftOpcode == Instruction::AShr) { - // To test for the bad case of the signed shr, see if any - // of the bits shifted in could be tested after the mask. - uint32_t TyBits = Ty->getPrimitiveSizeInBits(); - int ShAmtVal = TyBits - ShAmt->getLimitedValue(TyBits); - - uint32_t BitWidth = AndTy->getPrimitiveSizeInBits(); - if ((APInt::getHighBitsSet(BitWidth, BitWidth-ShAmtVal) & - AndCST->getValue()) == 0) + // There may be some constraints that make this possible, + // but nothing simple has been discovered yet. + CanFold = false; + } else if (ShiftOpcode == Instruction::Shl) { + // For a left shift, we can fold if the comparison is not signed. + // We can also fold a signed comparison if the mask value and + // comparison value are not negative. These constraints may not be + // obvious, but we can prove that they are correct using an SMT + // solver. + if (!ICI.isSigned() || (!AndCst->isNegative() && !RHS->isNegative())) + CanFold = true; + } else if (ShiftOpcode == Instruction::LShr) { + // For a logical right shift, we can fold if the comparison is not + // signed. We can also fold a signed comparison if the shifted mask + // value and the shifted comparison value are not negative. + // These constraints may not be obvious, but we can prove that they + // are correct using an SMT solver. + if (!ICI.isSigned()) CanFold = true; - } else if (ShiftOpcode == Instruction::Shl || - ShiftOpcode == Instruction::LShr) { - CanFold = !ICI.isSigned(); + else { + ConstantInt *ShiftedAndCst = + cast<ConstantInt>(ConstantExpr::getShl(AndCst, ShAmt)); + ConstantInt *ShiftedRHSCst = + cast<ConstantInt>(ConstantExpr::getShl(RHS, ShAmt)); + + if (!ShiftedAndCst->isNegative() && !ShiftedRHSCst->isNegative()) + CanFold = true; + } } if (CanFold) { Constant *NewCst; - if (Shift->getOpcode() == Instruction::Shl) + if (ShiftOpcode == Instruction::Shl) NewCst = ConstantExpr::getLShr(RHS, ShAmt); else NewCst = ConstantExpr::getShl(RHS, ShAmt); // Check to see if we are shifting out any of the bits being // compared. - if (ConstantExpr::get(Shift->getOpcode(), - NewCst, ShAmt) != RHS) { + if (ConstantExpr::get(ShiftOpcode, NewCst, ShAmt) != RHS) { // If we shifted bits out, the fold is not going to work out. // As a special case, check to see if this means that the // result is always true or false now. @@ -1242,12 +1246,12 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, return ReplaceInstUsesWith(ICI, Builder->getTrue()); } else { ICI.setOperand(1, NewCst); - Constant *NewAndCST; - if (Shift->getOpcode() == Instruction::Shl) - NewAndCST = ConstantExpr::getLShr(AndCST, ShAmt); + Constant *NewAndCst; + if (ShiftOpcode == Instruction::Shl) + NewAndCst = ConstantExpr::getLShr(AndCst, ShAmt); else - NewAndCST = ConstantExpr::getShl(AndCST, ShAmt); - LHSI->setOperand(1, NewAndCST); + NewAndCst = ConstantExpr::getShl(AndCst, ShAmt); + LHSI->setOperand(1, NewAndCst); LHSI->setOperand(0, Shift->getOperand(0)); Worklist.Add(Shift); // Shift is dead. 
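// e.g. icmp eq (and (lshr %X, 2), 15), 3 has now been rewritten as // icmp eq (and %X, 60), 12, and the dead shift will be removed.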
return &ICI; @@ -1264,10 +1268,10 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, // Compute C << Y. Value *NS; if (Shift->getOpcode() == Instruction::LShr) { - NS = Builder->CreateShl(AndCST, Shift->getOperand(1)); + NS = Builder->CreateShl(AndCst, Shift->getOperand(1)); } else { // Insert a logical shift. - NS = Builder->CreateLShr(AndCST, Shift->getOperand(1)); + NS = Builder->CreateLShr(AndCst, Shift->getOperand(1)); } // Compute X & (C << Y). @@ -1278,12 +1282,12 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, return &ICI; } - // Replace ((X & AndCST) > RHSV) with ((X & AndCST) != 0), if any - // bit set in (X & AndCST) will produce a result greater than RHSV. + // Replace ((X & AndCst) > RHSV) with ((X & AndCst) != 0), if any + // bit set in (X & AndCst) will produce a result greater than RHSV. if (ICI.getPredicate() == ICmpInst::ICMP_UGT) { - unsigned NTZ = AndCST->getValue().countTrailingZeros(); - if ((NTZ < AndCST->getBitWidth()) && - APInt::getOneBitSet(AndCST->getBitWidth(), NTZ).ugt(RHSV)) + unsigned NTZ = AndCst->getValue().countTrailingZeros(); + if ((NTZ < AndCst->getBitWidth()) && + APInt::getOneBitSet(AndCst->getBitWidth(), NTZ).ugt(RHSV)) return new ICmpInst(ICmpInst::ICMP_NE, LHSI, Constant::getNullValue(RHS->getType())); } @@ -1777,7 +1781,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, } } } - return 0; + return nullptr; } /// visitICmpInstWithCastAndCast - Handle icmp (cast x to y), (cast/cst). @@ -1792,9 +1796,9 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) { // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the // integer type is the same size as the pointer type. - if (TD && LHSCI->getOpcode() == Instruction::PtrToInt && - TD->getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth()) { - Value *RHSOp = 0; + if (DL && LHSCI->getOpcode() == Instruction::PtrToInt && + DL->getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth()) { + Value *RHSOp = nullptr; if (Constant *RHSC = dyn_cast<Constant>(ICI.getOperand(1))) { RHSOp = ConstantExpr::getIntToPtr(RHSC, SrcTy); } else if (PtrToIntInst *RHSC = dyn_cast<PtrToIntInst>(ICI.getOperand(1))) { @@ -1812,7 +1816,7 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) { // Enforce this. if (LHSCI->getOpcode() != Instruction::ZExt && LHSCI->getOpcode() != Instruction::SExt) - return 0; + return nullptr; bool isSignedExt = LHSCI->getOpcode() == Instruction::SExt; bool isSignedCmp = ICI.isSigned(); @@ -1821,12 +1825,12 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) { // Not an extension from the same type? RHSCIOp = CI->getOperand(0); if (RHSCIOp->getType() != LHSCIOp->getType()) - return 0; + return nullptr; // If the signedness of the two casts doesn't agree (i.e. one is a sext // and the other is a zext), then we can't handle this. if (CI->getOpcode() != LHSCI->getOpcode()) - return 0; + return nullptr; // Deal with equality cases early. if (ICI.isEquality()) @@ -1844,7 +1848,7 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) { // If we aren't dealing with a constant on the RHS, exit early ConstantInt *CI = dyn_cast<ConstantInt>(ICI.getOperand(1)); if (!CI) - return 0; + return nullptr; // Compute the constant that would happen if we truncated to SrcTy then // reextended to DestTy. 
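// For instance (illustrative), if %x = zext i8 %a to i32, the constant 300 // truncates to i8 44 and re-extends to 44 != 300, so no equivalent i8 // compare exists; compares that consequently fold to true or false were // already caught by SimplifyICmpInst.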
@@ -1873,7 +1877,7 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) { // by SimplifyICmpInst, so only deal with the tricky case. if (isSignedCmp || !isSignedExt) - return 0; + return nullptr; // Evaluate the comparison for LT (we invert for GT below). LE and GE cases // should have been folded away previously and not enter in here. @@ -1909,12 +1913,12 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, // In order to eliminate the add-with-constant, the compare can be its only // use. Instruction *AddWithCst = cast<Instruction>(I.getOperand(0)); - if (!AddWithCst->hasOneUse()) return 0; + if (!AddWithCst->hasOneUse()) return nullptr; // If CI2 is 2^7, 2^15, 2^31, then it might be an sadd.with.overflow. - if (!CI2->getValue().isPowerOf2()) return 0; + if (!CI2->getValue().isPowerOf2()) return nullptr; unsigned NewWidth = CI2->getValue().countTrailingZeros(); - if (NewWidth != 7 && NewWidth != 15 && NewWidth != 31) return 0; + if (NewWidth != 7 && NewWidth != 15 && NewWidth != 31) return nullptr; // The width of the new add formed is 1 more than the bias. ++NewWidth; @@ -1922,7 +1926,7 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, // Check to see that CI1 is an all-ones value with NewWidth bits. if (CI1->getBitWidth() == NewWidth || CI1->getValue() != APInt::getLowBitsSet(CI1->getBitWidth(), NewWidth)) - return 0; + return nullptr; // This is only really a signed overflow check if the inputs have been // sign-extended; check for that condition. For example, if CI2 is 2^31 and @@ -1930,25 +1934,24 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, unsigned NeededSignBits = CI1->getBitWidth() - NewWidth + 1; if (IC.ComputeNumSignBits(A) < NeededSignBits || IC.ComputeNumSignBits(B) < NeededSignBits) - return 0; + return nullptr; // In order to replace the original add with a narrower // llvm.sadd.with.overflow, the only uses allowed are the add-with-constant // and truncates that discard the high bits of the add. Verify that this is // the case. Instruction *OrigAdd = cast<Instruction>(AddWithCst->getOperand(0)); - for (Value::use_iterator UI = OrigAdd->use_begin(), E = OrigAdd->use_end(); UI != E; ++UI) { - if (*UI == AddWithCst) continue; + for (User *U : OrigAdd->users()) { + if (U == AddWithCst) continue; // Only accept truncates for now. We would really like a nice recursive // predicate like SimplifyDemandedBits, but one that goes down the use-def // chain to see which bits of a value are actually demanded. If the // original add had another add which was then immediately truncated, we // could still do the transformation. - TruncInst *TI = dyn_cast<TruncInst>(*UI); - if (TI == 0 || - TI->getType()->getPrimitiveSizeInBits() > NewWidth) return 0; + TruncInst *TI = dyn_cast<TruncInst>(U); + if (!TI || TI->getType()->getPrimitiveSizeInBits() > NewWidth) + return nullptr; } // If the pattern matches, truncate the inputs to the narrower type and @@ -1984,11 +1987,11 @@ static Instruction *ProcessUAddIdiom(Instruction &I, Value *OrigAddV, InstCombiner &IC) { // Don't bother doing this transformation for pointers; don't do it for // vectors. - if (!isa<IntegerType>(OrigAddV->getType())) return 0; + if (!isa<IntegerType>(OrigAddV->getType())) return nullptr; // If the add is a constant expr, then we don't bother transforming it.
Instruction *OrigAdd = dyn_cast<Instruction>(OrigAddV); - if (OrigAdd == 0) return 0; + if (!OrigAdd) return nullptr; Value *LHS = OrigAdd->getOperand(0), *RHS = OrigAdd->getOperand(1); @@ -2009,6 +2012,240 @@ static Instruction *ProcessUAddIdiom(Instruction &I, Value *OrigAddV, return ExtractValueInst::Create(Call, 1, "uadd.overflow"); } +/// \brief Recognize and process an idiom involving a test for multiplication +/// overflow. +/// +/// The caller has matched a pattern of the form: +/// I = cmp u (mul(zext A, zext B), V) +/// The function checks if this is a test for overflow and, if so, replaces +/// the multiplication with a call to the 'mul.with.overflow' intrinsic. +/// +/// \param I Compare instruction. +/// \param MulVal Result of 'mul' instruction. It is one of the arguments of +/// the compare instruction. Must be of integer type. +/// \param OtherVal The other argument of the compare instruction. +/// \returns Instruction which must replace the compare instruction, or NULL if +/// no replacement is required. +static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal, + Value *OtherVal, InstCombiner &IC) { + // Don't bother doing this transformation for pointers; don't do it for + // vectors. + if (!isa<IntegerType>(MulVal->getType())) + return nullptr; + + assert(I.getOperand(0) == MulVal || I.getOperand(1) == MulVal); + assert(I.getOperand(0) == OtherVal || I.getOperand(1) == OtherVal); + Instruction *MulInstr = cast<Instruction>(MulVal); + assert(MulInstr->getOpcode() == Instruction::Mul); + + Instruction *LHS = cast<Instruction>(MulInstr->getOperand(0)), + *RHS = cast<Instruction>(MulInstr->getOperand(1)); + assert(LHS->getOpcode() == Instruction::ZExt); + assert(RHS->getOpcode() == Instruction::ZExt); + Value *A = LHS->getOperand(0), *B = RHS->getOperand(0); + + // Calculate type and width of the result produced by mul.with.overflow. + Type *TyA = A->getType(), *TyB = B->getType(); + unsigned WidthA = TyA->getPrimitiveSizeInBits(), + WidthB = TyB->getPrimitiveSizeInBits(); + unsigned MulWidth; + Type *MulType; + if (WidthB > WidthA) { + MulWidth = WidthB; + MulType = TyB; + } else { + MulWidth = WidthA; + MulType = TyA; + } + + // In order to replace the original mul with a narrower mul.with.overflow, + // all uses must ignore upper bits of the product. The number of used low + // bits must not be greater than the width of mul.with.overflow. + if (MulVal->hasNUsesOrMore(2)) + for (User *U : MulVal->users()) { + if (U == &I) + continue; + if (TruncInst *TI = dyn_cast<TruncInst>(U)) { + // Check if truncation ignores bits above MulWidth. + unsigned TruncWidth = TI->getType()->getPrimitiveSizeInBits(); + if (TruncWidth > MulWidth) + return nullptr; + } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U)) { + // Check if AND ignores bits above MulWidth. + if (BO->getOpcode() != Instruction::And) + return nullptr; + if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) { + const APInt &CVal = CI->getValue(); + if (CVal.getBitWidth() - CVal.countLeadingZeros() > MulWidth) + return nullptr; + } + } else { + // Other uses prohibit this transformation.
+ return nullptr; + } + } + + // Recognize patterns + switch (I.getPredicate()) { + case ICmpInst::ICMP_EQ: + case ICmpInst::ICMP_NE: + // Recognize pattern: + // mulval = mul(zext A, zext B) + // cmp eq/neq mulval, zext trunc mulval + if (ZExtInst *Zext = dyn_cast<ZExtInst>(OtherVal)) + if (Zext->hasOneUse()) { + Value *ZextArg = Zext->getOperand(0); + if (TruncInst *Trunc = dyn_cast<TruncInst>(ZextArg)) + if (Trunc->getType()->getPrimitiveSizeInBits() == MulWidth) + break; // Recognized + } + + // Recognize pattern: + // mulval = mul(zext A, zext B) + // cmp eq/neq mulval, and(mulval, mask), mask selects low MulWidth bits. + ConstantInt *CI; + Value *ValToMask; + if (match(OtherVal, m_And(m_Value(ValToMask), m_ConstantInt(CI)))) { + if (ValToMask != MulVal) + return nullptr; + const APInt &CVal = CI->getValue() + 1; + if (CVal.isPowerOf2()) { + unsigned MaskWidth = CVal.logBase2(); + if (MaskWidth == MulWidth) + break; // Recognized + } + } + return nullptr; + + case ICmpInst::ICMP_UGT: + // Recognize pattern: + // mulval = mul(zext A, zext B) + // cmp ugt mulval, max + if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) { + APInt MaxVal = APInt::getMaxValue(MulWidth); + MaxVal = MaxVal.zext(CI->getBitWidth()); + if (MaxVal.eq(CI->getValue())) + break; // Recognized + } + return nullptr; + + case ICmpInst::ICMP_UGE: + // Recognize pattern: + // mulval = mul(zext A, zext B) + // cmp uge mulval, max+1 + if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) { + APInt MaxVal = APInt::getOneBitSet(CI->getBitWidth(), MulWidth); + if (MaxVal.eq(CI->getValue())) + break; // Recognized + } + return nullptr; + + case ICmpInst::ICMP_ULE: + // Recognize pattern: + // mulval = mul(zext A, zext B) + // cmp ule mulval, max + if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) { + APInt MaxVal = APInt::getMaxValue(MulWidth); + MaxVal = MaxVal.zext(CI->getBitWidth()); + if (MaxVal.eq(CI->getValue())) + break; // Recognized + } + return nullptr; + + case ICmpInst::ICMP_ULT: + // Recognize pattern: + // mulval = mul(zext A, zext B) + // cmp ult mulval, max + 1 + if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) { + APInt MaxVal = APInt::getOneBitSet(CI->getBitWidth(), MulWidth); + if (MaxVal.eq(CI->getValue())) + break; // Recognized + } + return nullptr; + + default: + return nullptr; + } + + InstCombiner::BuilderTy *Builder = IC.Builder; + Builder->SetInsertPoint(MulInstr); + Module *M = I.getParent()->getParent()->getParent(); + + // Replace: mul(zext A, zext B) --> mul.with.overflow(A, B) + Value *MulA = A, *MulB = B; + if (WidthA < MulWidth) + MulA = Builder->CreateZExt(A, MulType); + if (WidthB < MulWidth) + MulB = Builder->CreateZExt(B, MulType); + Value *F = + Intrinsic::getDeclaration(M, Intrinsic::umul_with_overflow, MulType); + CallInst *Call = Builder->CreateCall2(F, MulA, MulB, "umul"); + IC.Worklist.Add(MulInstr); + + // If there are uses of mul result other than the comparison, we know that + // they are truncation or binary AND. Change them to use the result of + // mul.with.overflow and properly adjust the mask/size.
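+ // For example (an illustrative sketch with i32 %a and %b): + // %m = mul i64 (zext %a), (zext %b); %t = trunc i64 %m to i32 + // becomes + // %r = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b) + // %t = extractvalue { i32, i1 } %r, 0 + // while the overflow test itself is answered by extractvalue %r, 1.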
+ if (MulVal->hasNUsesOrMore(2)) { + Value *Mul = Builder->CreateExtractValue(Call, 0, "umul.value"); + for (User *U : MulVal->users()) { + if (U == &I || U == OtherVal) + continue; + if (TruncInst *TI = dyn_cast<TruncInst>(U)) { + if (TI->getType()->getPrimitiveSizeInBits() == MulWidth) + IC.ReplaceInstUsesWith(*TI, Mul); + else + TI->setOperand(0, Mul); + } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U)) { + assert(BO->getOpcode() == Instruction::And); + // Replace (mul & mask) --> zext (mul.with.overflow & short_mask) + ConstantInt *CI = cast<ConstantInt>(BO->getOperand(1)); + APInt ShortMask = CI->getValue().trunc(MulWidth); + Value *ShortAnd = Builder->CreateAnd(Mul, ShortMask); + Instruction *Zext = + cast<Instruction>(Builder->CreateZExt(ShortAnd, BO->getType())); + IC.Worklist.Add(Zext); + IC.ReplaceInstUsesWith(*BO, Zext); + } else { + llvm_unreachable("Unexpected Binary operation"); + } + IC.Worklist.Add(cast<Instruction>(U)); + } + } + if (isa<Instruction>(OtherVal)) + IC.Worklist.Add(cast<Instruction>(OtherVal)); + + // The original icmp gets replaced with the overflow value, possibly inverted + // depending on the predicate. + bool Inverse = false; + switch (I.getPredicate()) { + case ICmpInst::ICMP_NE: + break; + case ICmpInst::ICMP_EQ: + Inverse = true; + break; + case ICmpInst::ICMP_UGT: + case ICmpInst::ICMP_UGE: + if (I.getOperand(0) == MulVal) + break; + Inverse = true; + break; + case ICmpInst::ICMP_ULT: + case ICmpInst::ICMP_ULE: + if (I.getOperand(1) == MulVal) + break; + Inverse = true; + break; + default: + llvm_unreachable("Unexpected predicate"); + } + if (Inverse) { + Value *Res = Builder->CreateExtractValue(Call, 1); + return BinaryOperator::CreateNot(Res); + } + + return ExtractValueInst::Create(Call, 1); +} + // DemandedBitsLHSMask - When performing a comparison against a constant, // it is possible that not all the bits in the LHS are demanded. This helper // method computes the mask that IS demanded. @@ -2048,7 +2285,7 @@ static APInt DemandedBitsLHSMask(ICmpInst &I, /// \brief Check if the order of \p Op0 and \p Op1 as operands in an ICmpInst /// should be swapped. -/// The descision is based on how many times these two operands are reused +/// The decision is based on how many times these two operands are reused /// as subtract operands and their positions in those instructions. /// The rationale is that several architectures use the same instruction for /// both subtract and cmp, thus it is better if the order of those operands /// matches. @@ -2064,12 +2301,12 @@ static bool swapMayExposeCSEOpportunities(const Value * Op0, // Each time Op0 is the first operand, count -1: swapping is bad, the // subtract already has the same layout as the compare. // Each time Op0 is the second operand, count +1: swapping is good, the - // subtract has a diffrent layout as the compare. + // subtract has a different layout than the compare. // At the end, if the benefit is greater than 0, Op0 should come second to // expose more CSE opportunities.
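// For example, if the function also contains sub %b, %a, emitting the // compare as icmp %b, %a lets a target whose subtract also sets flags // (e.g. ARM's SUBS) cover both operations with a single instruction.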
int GlobalSwapBenefits = 0; - for (Value::const_use_iterator UI = Op0->use_begin(), UIEnd = Op0->use_end(); UI != UIEnd; ++UI) { - const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(*UI); + for (const User *U : Op0->users()) { + const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(U); if (!BinOp || BinOp->getOpcode() != Instruction::Sub) continue; // If Op0 is the first argument, it is not beneficial to swap the @@ -2104,7 +2341,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { Changed = true; } - if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, TD)) + if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, DL)) return ReplaceInstUsesWith(I, V); // comparing -val or val with non-zero is the same as just comparing val // with zero @@ -2172,14 +2409,14 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { unsigned BitWidth = 0; if (Ty->isIntOrIntVectorTy()) BitWidth = Ty->getScalarSizeInBits(); - else if (TD) // Pointers require TD info to get their size. - BitWidth = TD->getTypeSizeInBits(Ty->getScalarType()); + else if (DL) // Pointers require DL info to get their size. + BitWidth = DL->getTypeSizeInBits(Ty->getScalarType()); bool isSignBit = false; // See if we are doing a comparison with a constant. if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { - Value *A = 0, *B = 0; + Value *A = nullptr, *B = nullptr; // Match the following pattern, which is a common idiom when writing // overflow-safe integer arithmetic functions. The source performs an @@ -2292,21 +2529,29 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // bit is set. If the comparison is against zero, then this is a check // to see if *that* bit is set. APInt Op0KnownZeroInverted = ~Op0KnownZero; - if (~Op1KnownZero == 0 && Op0KnownZeroInverted.isPowerOf2()) { + if (~Op1KnownZero == 0) { // If the LHS is an AND with the same constant, look through it. - Value *LHS = 0; - ConstantInt *LHSC = 0; + Value *LHS = nullptr; + ConstantInt *LHSC = nullptr; if (!match(Op0, m_And(m_Value(LHS), m_ConstantInt(LHSC))) || LHSC->getValue() != Op0KnownZeroInverted) LHS = Op0; // If the LHS is 1 << x, and we know the result is a power of 2 like 8, // then turn "((1 << x)&8) == 0" into "x != 3". - Value *X = 0; + // or turn "((1 << x)&7) == 0" into "x > 2". + Value *X = nullptr; if (match(LHS, m_Shl(m_One(), m_Value(X)))) { - unsigned CmpVal = Op0KnownZeroInverted.countTrailingZeros(); - return new ICmpInst(ICmpInst::ICMP_NE, X, - ConstantInt::get(X->getType(), CmpVal)); + APInt ValToCheck = Op0KnownZeroInverted; + if (ValToCheck.isPowerOf2()) { + unsigned CmpVal = ValToCheck.countTrailingZeros(); + return new ICmpInst(ICmpInst::ICMP_NE, X, + ConstantInt::get(X->getType(), CmpVal)); + } else if ((++ValToCheck).isPowerOf2()) { + unsigned CmpVal = ValToCheck.countTrailingZeros() - 1; + return new ICmpInst(ICmpInst::ICMP_UGT, X, + ConstantInt::get(X->getType(), CmpVal)); + } } // If the LHS is 8 >>u x, and we know the result is a power of 2 like 1, @@ -2329,21 +2574,29 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // bit is set. If the comparison is against zero, then this is a check // to see if *that* bit is set. APInt Op0KnownZeroInverted = ~Op0KnownZero; - if (~Op1KnownZero == 0 && Op0KnownZeroInverted.isPowerOf2()) { + if (~Op1KnownZero == 0) { // If the LHS is an AND with the same constant, look through it.
- Value *LHS = 0; - ConstantInt *LHSC = 0; + Value *LHS = nullptr; + ConstantInt *LHSC = nullptr; if (!match(Op0, m_And(m_Value(LHS), m_ConstantInt(LHSC))) || LHSC->getValue() != Op0KnownZeroInverted) LHS = Op0; // If the LHS is 1 << x, and we know the result is a power of 2 like 8, // then turn "((1 << x)&8) != 0" into "x == 3". - Value *X = 0; + // or turn "((1 << x)&7) != 0" into "x < 3". + Value *X = nullptr; if (match(LHS, m_Shl(m_One(), m_Value(X)))) { - unsigned CmpVal = Op0KnownZeroInverted.countTrailingZeros(); - return new ICmpInst(ICmpInst::ICMP_EQ, X, - ConstantInt::get(X->getType(), CmpVal)); + APInt ValToCheck = Op0KnownZeroInverted; + if (ValToCheck.isPowerOf2()) { + unsigned CmpVal = ValToCheck.countTrailingZeros(); + return new ICmpInst(ICmpInst::ICMP_EQ, X, + ConstantInt::get(X->getType(), CmpVal)); + } else if ((++ValToCheck).isPowerOf2()) { + unsigned CmpVal = ValToCheck.countTrailingZeros(); + return new ICmpInst(ICmpInst::ICMP_ULT, X, + ConstantInt::get(X->getType(), CmpVal)); + } } // If the LHS is 8 >>u x, and we know the result is a power of 2 like 1, @@ -2468,10 +2721,10 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // operands has at least one user besides the compare (the select), // which would often largely negate the benefit of folding anyway. if (I.hasOneUse()) - if (SelectInst *SI = dyn_cast<SelectInst>(*I.use_begin())) + if (SelectInst *SI = dyn_cast<SelectInst>(*I.user_begin())) if ((SI->getOperand(1) == Op0 && SI->getOperand(2) == Op1) || (SI->getOperand(2) == Op0 && SI->getOperand(1) == Op1)) - return 0; + return nullptr; // See if we are doing a comparison between a constant and an instruction that // can be folded into the comparison. @@ -2507,7 +2760,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // If either operand of the select is a constant, we can fold the // comparison into the select arms, which will cause one to be // constant folded and the select turned into a bitwise or. - Value *Op1 = 0, *Op2 = 0; + Value *Op1 = nullptr, *Op2 = nullptr; if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC); if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) @@ -2532,8 +2785,8 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { } case Instruction::IntToPtr: // icmp pred inttoptr(X), null -> icmp pred X, 0 - if (RHSC->isNullValue() && TD && - TD->getIntPtrType(RHSC->getType()) == + if (RHSC->isNullValue() && DL && + DL->getIntPtrType(RHSC->getType()) == LHSI->getOperand(0)->getType()) return new ICmpInst(I.getPredicate(), LHSI->getOperand(0), Constant::getNullValue(LHSI->getOperand(0)->getType())); @@ -2619,7 +2872,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // Analyze the case when either Op0 or Op1 is an add instruction. // Op0 = A + B (or A and B are null); Op1 = C + D (or C and D are null). - Value *A = 0, *B = 0, *C = 0, *D = 0; + Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr; if (BO0 && BO0->getOpcode() == Instruction::Add) A = BO0->getOperand(0), B = BO0->getOperand(1); if (BO1 && BO1->getOpcode() == Instruction::Add) @@ -2714,7 +2967,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // Analyze the case when either Op0 or Op1 is a sub instruction. // Op0 = A - B (or A and B are null); Op1 = C - D (or C and D are null). 
- A = 0; B = 0; C = 0; D = 0; + A = nullptr; B = nullptr; C = nullptr; D = nullptr; if (BO0 && BO0->getOpcode() == Instruction::Sub) A = BO0->getOperand(0), B = BO0->getOperand(1); if (BO1 && BO1->getOpcode() == Instruction::Sub) @@ -2740,7 +2993,17 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { BO0->hasOneUse() && BO1->hasOneUse()) return new ICmpInst(Pred, D, B); - BinaryOperator *SRem = NULL; + // icmp (0-X) < cst --> x > -cst + if (NoOp0WrapProblem && ICmpInst::isSigned(Pred)) { + Value *X; + if (match(BO0, m_Neg(m_Value(X)))) + if (ConstantInt *RHSC = dyn_cast<ConstantInt>(Op1)) + if (!RHSC->isMinValue(/*isSigned=*/true)) + return new ICmpInst(I.getSwappedPredicate(), X, + ConstantExpr::getNeg(RHSC)); + } + + BinaryOperator *SRem = nullptr; // icmp (srem X, Y), Y if (BO0 && BO0->getOpcode() == Instruction::SRem && Op1 == BO0->getOperand(1)) @@ -2878,6 +3141,16 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { (Op0 == A || Op0 == B)) if (Instruction *R = ProcessUAddIdiom(I, Op1, *this)) return R; + + // (zext a) * (zext b) --> llvm.umul.with.overflow. + if (match(Op0, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))))) { + if (Instruction *R = ProcessUMulZExtIdiom(I, Op0, Op1, *this)) + return R; + } + if (match(Op1, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))))) { + if (Instruction *R = ProcessUMulZExtIdiom(I, Op1, Op0, *this)) + return R; + } } if (I.isEquality()) { @@ -2919,7 +3192,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // (X&Z) == (Y&Z) -> (X^Y) & Z == 0 if (match(Op0, m_OneUse(m_And(m_Value(A), m_Value(B)))) && match(Op1, m_OneUse(m_And(m_Value(C), m_Value(D))))) { - Value *X = 0, *Y = 0, *Z = 0; + Value *X = nullptr, *Y = nullptr, *Z = nullptr; if (A == C) { X = B; Y = D; Z = A; @@ -3010,7 +3283,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (match(Op1, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op0 == X) return FoldICmpAddOpCst(I, X, Cst, I.getSwappedPredicate()); } - return Changed ? &I : 0; + return Changed ? &I : nullptr; } /// FoldFCmp_IntToFP_Cst - Fold fcmp ([us]itofp x, cst) if possible. @@ -3018,13 +3291,13 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, Instruction *LHSI, Constant *RHSC) { - if (!isa<ConstantFP>(RHSC)) return 0; + if (!isa<ConstantFP>(RHSC)) return nullptr; const APFloat &RHS = cast<ConstantFP>(RHSC)->getValueAPF(); // Get the width of the mantissa. We don't want to hack on conversions that // might lose information from the integer, e.g. "i64 -> float" int MantissaWidth = LHSI->getType()->getFPMantissaWidth(); - if (MantissaWidth == -1) return 0; // Unknown. + if (MantissaWidth == -1) return nullptr; // Unknown. // Check to see that the input is converted from an integer type that is small // enough to preserve all bits. TODO: check here for "known" sign bits. @@ -3038,7 +3311,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, // If the conversion would lose info, don't hack on this. if ((int)InputSize > MantissaWidth) - return 0; + return nullptr; // Otherwise, we can potentially simplify the comparison.
We know that it // will always come through as an integer value and we know the constant is @@ -3229,7 +3502,7 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - if (Value *V = SimplifyFCmpInst(I.getPredicate(), Op0, Op1, TD)) + if (Value *V = SimplifyFCmpInst(I.getPredicate(), Op0, Op1, DL)) return ReplaceInstUsesWith(I, V); // Simplify 'fcmp pred X, X' @@ -3313,31 +3586,6 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) { if (Instruction *NV = FoldFCmp_IntToFP_Cst(I, LHSI, RHSC)) return NV; break; - case Instruction::Select: { - // If either operand of the select is a constant, we can fold the - // comparison into the select arms, which will cause one to be - // constant folded and the select turned into a bitwise or. - Value *Op1 = 0, *Op2 = 0; - if (LHSI->hasOneUse()) { - if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) { - // Fold the known value into the constant operand. - Op1 = ConstantExpr::getCompare(I.getPredicate(), C, RHSC); - // Insert a new FCmp of the other select operand. - Op2 = Builder->CreateFCmp(I.getPredicate(), - LHSI->getOperand(2), RHSC, I.getName()); - } else if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) { - // Fold the known value into the constant operand. - Op2 = ConstantExpr::getCompare(I.getPredicate(), C, RHSC); - // Insert a new FCmp of the other select operand. - Op1 = Builder->CreateFCmp(I.getPredicate(), LHSI->getOperand(1), - RHSC, I.getName()); - } - } - - if (Op1) - return SelectInst::Create(LHSI->getOperand(0), Op1, Op2); - break; - } case Instruction::FSub: { // fcmp pred (fneg x), C -> fcmp swap(pred) x, -C Value *Op; @@ -3409,5 +3657,5 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) { return new FCmpInst(I.getPredicate(), LHSExt->getOperand(0), RHSExt->getOperand(0)); - return Changed ? &I : 0; + return Changed ? &I : nullptr; } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 4c861b3..e9c25d3 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -20,6 +20,8 @@ #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "instcombine" + STATISTIC(NumDeadStore, "Number of dead stores eliminated"); STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global"); @@ -29,10 +31,13 @@ STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global"); static bool pointsToConstantGlobal(Value *V) { if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) return GV->isConstant(); - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) + + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) { if (CE->getOpcode() == Instruction::BitCast || + CE->getOpcode() == Instruction::AddrSpaceCast || CE->getOpcode() == Instruction::GetElementPtr) return pointsToConstantGlobal(CE->getOperand(0)); + } return false; } @@ -45,95 +50,102 @@ static bool pointsToConstantGlobal(Value *V) { /// can optimize this. static bool isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, - SmallVectorImpl<Instruction *> &ToDelete, - bool IsOffset = false) { + SmallVectorImpl<Instruction *> &ToDelete) { // We track lifetime intrinsics as we encounter them. If we decide to go // ahead and replace the value with the global, this lets the caller quickly // eliminate the markers. 
- for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) { - User *U = cast<Instruction>(*UI); - - if (LoadInst *LI = dyn_cast<LoadInst>(U)) { - // Ignore non-volatile loads, they are always ok. - if (!LI->isSimple()) return false; - continue; - } - - if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) { - // If uses of the bitcast are ok, we are ok. - if (!isOnlyCopiedFromConstantGlobal(BCI, TheCopy, ToDelete, IsOffset)) - return false; - continue; - } - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) { - // If the GEP has all zero indices, it doesn't offset the pointer. If it - // doesn't, it does. - if (!isOnlyCopiedFromConstantGlobal( - GEP, TheCopy, ToDelete, IsOffset || !GEP->hasAllZeroIndices())) - return false; - continue; - } - - if (CallSite CS = U) { - // If this is the function being called then we treat it like a load and - // ignore it. - if (CS.isCallee(UI)) + SmallVector<std::pair<Value *, bool>, 35> ValuesToInspect; + ValuesToInspect.push_back(std::make_pair(V, false)); + while (!ValuesToInspect.empty()) { + auto ValuePair = ValuesToInspect.pop_back_val(); + const bool IsOffset = ValuePair.second; + for (auto &U : ValuePair.first->uses()) { + Instruction *I = cast<Instruction>(U.getUser()); + + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + // Ignore non-volatile loads, they are always ok. + if (!LI->isSimple()) return false; continue; + } - // If this is a readonly/readnone call site, then we know it is just a - // load (but one that potentially returns the value itself), so we can - // ignore it if we know that the value isn't captured. - unsigned ArgNo = CS.getArgumentNo(UI); - if (CS.onlyReadsMemory() && - (CS.getInstruction()->use_empty() || CS.doesNotCapture(ArgNo))) + if (isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I)) { + // If uses of the bitcast are ok, we are ok. + ValuesToInspect.push_back(std::make_pair(I, IsOffset)); continue; - - // If this is being passed as a byval argument, the caller is making a - // copy, so it is only a read of the alloca. - if (CS.isByValArgument(ArgNo)) + } + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) { + // If the GEP has all zero indices, it doesn't offset the pointer. If it + // doesn't, it does. + ValuesToInspect.push_back( + std::make_pair(I, IsOffset || !GEP->hasAllZeroIndices())); continue; - } + } - // Lifetime intrinsics can be handled by the caller. - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) { - if (II->getIntrinsicID() == Intrinsic::lifetime_start || - II->getIntrinsicID() == Intrinsic::lifetime_end) { - assert(II->use_empty() && "Lifetime markers have no result to use!"); - ToDelete.push_back(II); - continue; + if (CallSite CS = I) { + // If this is the function being called then we treat it like a load and + // ignore it. + if (CS.isCallee(&U)) + continue; + + // Inalloca arguments are clobbered by the call. + unsigned ArgNo = CS.getArgumentNo(&U); + if (CS.isInAllocaArgument(ArgNo)) + return false; + + // If this is a readonly/readnone call site, then we know it is just a + // load (but one that potentially returns the value itself), so we can + // ignore it if we know that the value isn't captured. + if (CS.onlyReadsMemory() && + (CS.getInstruction()->use_empty() || CS.doesNotCapture(ArgNo))) + continue; + + // If this is being passed as a byval argument, the caller is making a + // copy, so it is only a read of the alloca. 
+ if (CS.isByValArgument(ArgNo)) + continue; } - } - // If this is isn't our memcpy/memmove, reject it as something we can't - // handle. - MemTransferInst *MI = dyn_cast<MemTransferInst>(U); - if (MI == 0) - return false; + // Lifetime intrinsics can be handled by the caller. + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) { + assert(II->use_empty() && "Lifetime markers have no result to use!"); + ToDelete.push_back(II); + continue; + } + } - // If the transfer is using the alloca as a source of the transfer, then - // ignore it since it is a load (unless the transfer is volatile). - if (UI.getOperandNo() == 1) { - if (MI->isVolatile()) return false; - continue; - } + // If this is isn't our memcpy/memmove, reject it as something we can't + // handle. + MemTransferInst *MI = dyn_cast<MemTransferInst>(I); + if (!MI) + return false; - // If we already have seen a copy, reject the second one. - if (TheCopy) return false; + // If the transfer is using the alloca as a source of the transfer, then + // ignore it since it is a load (unless the transfer is volatile). + if (U.getOperandNo() == 1) { + if (MI->isVolatile()) return false; + continue; + } - // If the pointer has been offset from the start of the alloca, we can't - // safely handle this. - if (IsOffset) return false; + // If we already have seen a copy, reject the second one. + if (TheCopy) return false; - // If the memintrinsic isn't using the alloca as the dest, reject it. - if (UI.getOperandNo() != 0) return false; + // If the pointer has been offset from the start of the alloca, we can't + // safely handle this. + if (IsOffset) return false; - // If the source of the memcpy/move is not a constant global, reject it. - if (!pointsToConstantGlobal(MI->getSource())) - return false; + // If the memintrinsic isn't using the alloca as the dest, reject it. + if (U.getOperandNo() != 0) return false; + + // If the source of the memcpy/move is not a constant global, reject it. + if (!pointsToConstantGlobal(MI->getSource())) + return false; - // Otherwise, the transform is safe. Remember the copy instruction. - TheCopy = MI; + // Otherwise, the transform is safe. Remember the copy instruction. + TheCopy = MI; + } } return true; } @@ -144,17 +156,17 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, static MemTransferInst * isOnlyCopiedFromConstantGlobal(AllocaInst *AI, SmallVectorImpl<Instruction *> &ToDelete) { - MemTransferInst *TheCopy = 0; + MemTransferInst *TheCopy = nullptr; if (isOnlyCopiedFromConstantGlobal(AI, TheCopy, ToDelete)) return TheCopy; - return 0; + return nullptr; } Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // Ensure that the alloca array size argument has type intptr_t, so that // any casting is exposed early. 
- if (TD) { - Type *IntPtrTy = TD->getIntPtrType(AI.getType()); + if (DL) { + Type *IntPtrTy = DL->getIntPtrType(AI.getType()); if (AI.getArraySize()->getType() != IntPtrTy) { Value *V = Builder->CreateIntCast(AI.getArraySize(), IntPtrTy, false); @@ -168,7 +180,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) { Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue()); - AllocaInst *New = Builder->CreateAlloca(NewTy, 0, AI.getName()); + AllocaInst *New = Builder->CreateAlloca(NewTy, nullptr, AI.getName()); New->setAlignment(AI.getAlignment()); // Scan to the end of the allocation instructions, to skip over a block of @@ -180,8 +192,8 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // Now that I is pointing to the first non-allocation-inst in the block, // insert our getelementptr instruction... // - Type *IdxTy = TD - ? TD->getIntPtrType(AI.getType()) + Type *IdxTy = DL + ? DL->getIntPtrType(AI.getType()) : Type::getInt64Ty(AI.getContext()); Value *NullIdx = Constant::getNullValue(IdxTy); Value *Idx[2] = { NullIdx, NullIdx }; @@ -197,15 +209,15 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { } } - if (TD && AI.getAllocatedType()->isSized()) { + if (DL && AI.getAllocatedType()->isSized()) { // If the alignment is 0 (unspecified), assign it the preferred alignment. if (AI.getAlignment() == 0) - AI.setAlignment(TD->getPrefTypeAlignment(AI.getAllocatedType())); + AI.setAlignment(DL->getPrefTypeAlignment(AI.getAllocatedType())); // Move all alloca's of zero byte objects to the entry block and merge them // together. Note that we only do this for alloca's, because malloc should // allocate and return a unique pointer, even for a zero byte allocation. - if (TD->getTypeAllocSize(AI.getAllocatedType()) == 0) { + if (DL->getTypeAllocSize(AI.getAllocatedType()) == 0) { // For a zero sized alloca there is no point in doing an array allocation. // This is helpful if the array size is a complicated expression not used // elsewhere. @@ -223,7 +235,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // dominance as the array size was forced to a constant earlier already. AllocaInst *EntryAI = dyn_cast<AllocaInst>(FirstInst); if (!EntryAI || !EntryAI->getAllocatedType()->isSized() || - TD->getTypeAllocSize(EntryAI->getAllocatedType()) != 0) { + DL->getTypeAllocSize(EntryAI->getAllocatedType()) != 0) { AI.moveBefore(FirstInst); return &AI; } @@ -232,7 +244,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // assign it the preferred alignment. if (EntryAI->getAlignment() == 0) EntryAI->setAlignment( - TD->getPrefTypeAlignment(EntryAI->getAllocatedType())); + DL->getPrefTypeAlignment(EntryAI->getAllocatedType())); // Replace this zero-sized alloca with the one at the start of the entry // block after ensuring that the address will be aligned enough for both // types. 
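The visitAllocaInst hunks above are a mechanical TD-to-DL rename, but they collect the three DataLayout queries this combine depends on: the intptr_t type for the array-size operand, the preferred alignment for unaligned allocas, and the allocation size used to spot zero-sized allocas. A minimal standalone sketch of those queries follows; the layout string is an assumed x86-64-style example, not something taken from the patch (real passes obtain the layout from the module or pass manager).

  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  int main() {
    LLVMContext Ctx;
    // Assumed layout string, for illustration only.
    DataLayout DL("e-p:64:64:64-i64:64-f80:128-n8:16:32:64-S128");
    // The query visitAllocaInst uses to coerce an alloca's array-size
    // operand to intptr_t so any casting is exposed early.
    Type *IntPtrTy = DL.getIntPtrType(Type::getInt8PtrTy(Ctx));
    outs() << "intptr bits: " << IntPtrTy->getPrimitiveSizeInBits() << "\n";
    // The query used to assign a preferred alignment when none is given.
    outs() << "pref align(i32): "
           << DL.getPrefTypeAlignment(Type::getInt32Ty(Ctx)) << "\n";
    // Zero-sized allocas are detected with getTypeAllocSize(...) == 0.
    outs() << "alloc size([0 x i8]): "
           << DL.getTypeAllocSize(ArrayType::get(Type::getInt8Ty(Ctx), 0))
           << "\n";
    return 0;
  }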
@@ -256,7 +268,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { SmallVector<Instruction *, 4> ToDelete; if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) { unsigned SourceAlign = getOrEnforceKnownAlignment(Copy->getSource(), - AI.getAlignment(), TD); + AI.getAlignment(), DL); if (AI.getAlignment() <= SourceAlign) { DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n'); DEBUG(dbgs() << " memcpy = " << *Copy << '\n'); @@ -281,7 +293,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { /// InstCombineLoadCast - Fold 'load (cast P)' -> cast (load P)' when possible. static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI, - const DataLayout *TD) { + const DataLayout *DL) { User *CI = cast<User>(LI.getOperand(0)); Value *CastOp = CI->getOperand(0); @@ -291,7 +303,7 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI, // If the address spaces don't match, don't eliminate the cast. if (DestTy->getAddressSpace() != SrcTy->getAddressSpace()) - return 0; + return nullptr; Type *SrcPTy = SrcTy->getElementType(); @@ -303,8 +315,8 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI, if (ArrayType *ASrcTy = dyn_cast<ArrayType>(SrcPTy)) if (Constant *CSrc = dyn_cast<Constant>(CastOp)) if (ASrcTy->getNumElements() != 0) { - Type *IdxTy = TD - ? TD->getIntPtrType(SrcTy) + Type *IdxTy = DL + ? DL->getIntPtrType(SrcTy) : Type::getInt64Ty(SrcTy->getContext()); Value *Idx = Constant::getNullValue(IdxTy); Value *Idxs[2] = { Idx, Idx }; @@ -331,23 +343,30 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI, NewLoad->setAlignment(LI.getAlignment()); NewLoad->setAtomic(LI.getOrdering(), LI.getSynchScope()); // Now cast the result of the load. + PointerType *OldTy = dyn_cast<PointerType>(NewLoad->getType()); + PointerType *NewTy = dyn_cast<PointerType>(LI.getType()); + if (OldTy && NewTy && + OldTy->getAddressSpace() != NewTy->getAddressSpace()) { + return new AddrSpaceCastInst(NewLoad, LI.getType()); + } + return new BitCastInst(NewLoad, LI.getType()); } } } - return 0; + return nullptr; } Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { Value *Op = LI.getOperand(0); // Attempt to improve the alignment. - if (TD) { + if (DL) { unsigned KnownAlign = - getOrEnforceKnownAlignment(Op, TD->getPrefTypeAlignment(LI.getType()),TD); + getOrEnforceKnownAlignment(Op, DL->getPrefTypeAlignment(LI.getType()),DL); unsigned LoadAlign = LI.getAlignment(); unsigned EffectiveLoadAlign = LoadAlign != 0 ? LoadAlign : - TD->getABITypeAlignment(LI.getType()); + DL->getABITypeAlignment(LI.getType()); if (KnownAlign > EffectiveLoadAlign) LI.setAlignment(KnownAlign); @@ -357,12 +376,12 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { // load (cast X) --> cast (load X) iff safe. if (isa<CastInst>(Op)) - if (Instruction *Res = InstCombineLoadCast(*this, LI, TD)) + if (Instruction *Res = InstCombineLoadCast(*this, LI, DL)) return Res; // None of the following transforms are legal for volatile/atomic loads. // FIXME: Some of it is okay for atomic loads; needs refactoring. 
- if (!LI.isSimple()) return 0; + if (!LI.isSimple()) return nullptr; // Do really simple store-to-load forwarding and load CSE, to catch cases // where there are several consecutive memory accesses to the same location, @@ -401,7 +420,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { // Instcombine load (constantexpr_cast global) -> cast (load global) if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op)) if (CE->isCast()) - if (Instruction *Res = InstCombineLoadCast(*this, LI, TD)) + if (Instruction *Res = InstCombineLoadCast(*this, LI, DL)) return Res; if (Op->hasOneUse()) { @@ -418,8 +437,8 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { if (SelectInst *SI = dyn_cast<SelectInst>(Op)) { // load (select (Cond, &V1, &V2)) --> select(Cond, load &V1, load &V2). unsigned Align = LI.getAlignment(); - if (isSafeToLoadUnconditionally(SI->getOperand(1), SI, Align, TD) && - isSafeToLoadUnconditionally(SI->getOperand(2), SI, Align, TD)) { + if (isSafeToLoadUnconditionally(SI->getOperand(1), SI, Align, DL) && + isSafeToLoadUnconditionally(SI->getOperand(2), SI, Align, DL)) { LoadInst *V1 = Builder->CreateLoad(SI->getOperand(1), SI->getOperand(1)->getName()+".val"); LoadInst *V2 = Builder->CreateLoad(SI->getOperand(2), @@ -444,7 +463,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { } } } - return 0; + return nullptr; } /// InstCombineStoreToCast - Fold store V, (cast P) -> store (cast V), P @@ -454,14 +473,14 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) { User *CI = cast<User>(SI.getOperand(1)); Value *CastOp = CI->getOperand(0); - Type *DestPTy = cast<PointerType>(CI->getType())->getElementType(); + Type *DestPTy = CI->getType()->getPointerElementType(); PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType()); - if (SrcTy == 0) return 0; + if (!SrcTy) return nullptr; Type *SrcPTy = SrcTy->getElementType(); if (!DestPTy->isIntegerTy() && !DestPTy->isPointerTy()) - return 0; + return nullptr; /// NewGEPIndices - If SrcPTy is an aggregate type, we can emit a "noop gep" /// to its first element. This allows us to handle things like: @@ -495,30 +514,40 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) { } if (!SrcPTy->isIntegerTy() && !SrcPTy->isPointerTy()) - return 0; + return nullptr; + + // If the pointers point into different address spaces don't do the + // transformation. + if (SrcTy->getAddressSpace() != CI->getType()->getPointerAddressSpace()) + return nullptr; - // If the pointers point into different address spaces or if they point to - // values with different sizes, we can't do the transformation. + // If the pointers point to values of different sizes don't do the + // transformation. if (!IC.getDataLayout() || - SrcTy->getAddressSpace() != - cast<PointerType>(CI->getType())->getAddressSpace() || IC.getDataLayout()->getTypeSizeInBits(SrcPTy) != IC.getDataLayout()->getTypeSizeInBits(DestPTy)) - return 0; + return nullptr; + + // If the pointers point to pointers to different address spaces don't do the + // transformation. It is not safe to introduce an addrspacecast instruction in + // this case since, depending on the target, addrspacecast may not be a no-op + // cast. + if (SrcPTy->isPointerTy() && DestPTy->isPointerTy() && + SrcPTy->getPointerAddressSpace() != DestPTy->getPointerAddressSpace()) + return nullptr; // Okay, we are casting from one integer or pointer type to another of // the same size. Instead of casting the pointer before // the store, cast the value to be stored. 
Value *NewCast; - Value *SIOp0 = SI.getOperand(0); Instruction::CastOps opcode = Instruction::BitCast; - Type* CastSrcTy = SIOp0->getType(); + Type* CastSrcTy = DestPTy; Type* CastDstTy = SrcPTy; if (CastDstTy->isPointerTy()) { if (CastSrcTy->isIntegerTy()) opcode = Instruction::IntToPtr; } else if (CastDstTy->isIntegerTy()) { - if (SIOp0->getType()->isPointerTy()) + if (CastSrcTy->isPointerTy()) opcode = Instruction::PtrToInt; } @@ -527,6 +556,7 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) { if (!NewGEPIndices.empty()) CastOp = IC.Builder->CreateInBoundsGEP(CastOp, NewGEPIndices); + Value *SIOp0 = SI.getOperand(0); NewCast = IC.Builder->CreateCast(opcode, SIOp0, CastDstTy, SIOp0->getName()+".c"); SI.setOperand(0, NewCast); @@ -568,13 +598,13 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { Value *Ptr = SI.getOperand(1); // Attempt to improve the alignment. - if (TD) { + if (DL) { unsigned KnownAlign = - getOrEnforceKnownAlignment(Ptr, TD->getPrefTypeAlignment(Val->getType()), - TD); + getOrEnforceKnownAlignment(Ptr, DL->getPrefTypeAlignment(Val->getType()), + DL); unsigned StoreAlign = SI.getAlignment(); unsigned EffectiveStoreAlign = StoreAlign != 0 ? StoreAlign : - TD->getABITypeAlignment(Val->getType()); + DL->getABITypeAlignment(Val->getType()); if (KnownAlign > EffectiveStoreAlign) SI.setAlignment(KnownAlign); @@ -584,7 +614,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { // Don't hack volatile/atomic stores. // FIXME: Some bits are legal for atomic stores; needs refactoring. - if (!SI.isSimple()) return 0; + if (!SI.isSimple()) return nullptr; // If the RHS is an alloca with a single use, zapify the store, making the // alloca dead. @@ -651,7 +681,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (Instruction *U = dyn_cast<Instruction>(Val)) Worklist.Add(U); // Dropped a use. } - return 0; // Do not modify these! + return nullptr; // Do not modify these! } // store undef, Ptr -> noop @@ -680,9 +710,9 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (BranchInst *BI = dyn_cast<BranchInst>(BBI)) if (BI->isUnconditional()) if (SimplifyStoreAtEndOfBlock(SI)) - return 0; // xform done! + return nullptr; // xform done! - return 0; + return nullptr; } /// SimplifyStoreAtEndOfBlock - Turn things like: @@ -705,7 +735,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { // the other predecessor. pred_iterator PI = pred_begin(DestBB); BasicBlock *P = *PI; - BasicBlock *OtherBB = 0; + BasicBlock *OtherBB = nullptr; if (P != StoreBB) OtherBB = P; @@ -735,7 +765,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { // If the other block ends in an unconditional branch, check for the 'if then // else' case. there is an instruction before the branch. - StoreInst *OtherStore = 0; + StoreInst *OtherStore = nullptr; if (OtherBr->isUnconditional()) { --BBI; // Skip over debugging info. 
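The InstCombineStoreToCast changes above split one combined bail-out into three and add a new one: when both pointee types are themselves pointers into different address spaces, the fold is abandoned, because rewriting it would require an addrspacecast and, depending on the target, addrspacecast may not be a no-op. A minimal sketch of that guard in isolation (hypothetical helper name; only the core Type API is assumed):

  #include "llvm/IR/Type.h"
  using namespace llvm;

  // Returns false exactly when the new bail-out in the diff fires:
  // a pointer-to-pointer store whose pointee address spaces differ.
  static bool pointeeAddrSpacesMatch(Type *SrcPTy, Type *DestPTy) {
    if (SrcPTy->isPointerTy() && DestPTy->isPointerTy())
      return SrcPTy->getPointerAddressSpace() ==
             DestPTy->getPointerAddressSpace();
    return true; // Non-pointer pointees are covered by the size check.
  }

Splitting the checks also lets each reject carry its own explanatory comment, which is why the hunk reads as three small early returns instead of one compound condition.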
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index a759548..6c6e7d8 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -15,10 +15,12 @@ #include "InstCombine.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/Support/PatternMatch.h" +#include "llvm/IR/PatternMatch.h" using namespace llvm; using namespace PatternMatch; +#define DEBUG_TYPE "instcombine" + /// simplifyValueKnownNonZero - The specific integer value is used in a context /// where it is known to be non-zero. If this allows us to simplify the @@ -27,13 +29,13 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) { // If V has multiple uses, then we would have to do more analysis to determine // if this is safe. For example, the use could be in dynamically unreached // code. - if (!V->hasOneUse()) return 0; + if (!V->hasOneUse()) return nullptr; bool MadeChange = false; // ((1 << A) >>u B) --> (1 << (A-B)) // Because V cannot be zero, we know that B is less than A. - Value *A = 0, *B = 0, *PowerOf2 = 0; + Value *A = nullptr, *B = nullptr, *PowerOf2 = nullptr; if (match(V, m_LShr(m_OneUse(m_Shl(m_Value(PowerOf2), m_Value(A))), m_Value(B))) && // The "1" can be any value known to be a power of 2. @@ -68,7 +70,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) { // If V is a phi node, we can call this on each of its operands. // "select cond, X, 0" can simplify to "X". - return MadeChange ? V : 0; + return MadeChange ? V : nullptr; } @@ -107,7 +109,7 @@ static Constant *getLogBase2Vector(ConstantDataVector *CV) { for (unsigned I = 0, E = CV->getNumElements(); I != E; ++I) { Constant *Elt = CV->getElementAsConstant(I); if (!match(Elt, m_APInt(IVal)) || !IVal->isPowerOf2()) - return 0; + return nullptr; Elts.push_back(ConstantInt::get(Elt->getType(), IVal->logBase2())); } @@ -118,7 +120,10 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - if (Value *V = SimplifyMulInst(Op0, Op1, TD)) + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + + if (Value *V = SimplifyMulInst(Op0, Op1, DL)) return ReplaceInstUsesWith(I, V); if (Value *V = SimplifyUsingDistributiveLaws(I)) @@ -139,7 +144,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { return BinaryOperator::CreateMul(NewOp, ConstantExpr::getShl(C1, C2)); if (match(&I, m_Mul(m_Value(NewOp), m_Constant(C1)))) { - Constant *NewCst = 0; + Constant *NewCst = nullptr; if (match(C1, m_APInt(IVal)) && IVal->isPowerOf2()) // Replace X*(2^C) with X << C, where C is either a scalar or a splat. NewCst = ConstantInt::get(NewOp->getType(), IVal->logBase2()); @@ -158,15 +163,6 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { } if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { - // Canonicalize (X+C1)*CI -> X*CI+C1*CI. - { Value *X; ConstantInt *C1; - if (Op0->hasOneUse() && - match(Op0, m_Add(m_Value(X), m_ConstantInt(C1)))) { - Value *Add = Builder->CreateMul(X, CI); - return BinaryOperator::CreateAdd(Add, Builder->CreateMul(C1, CI)); - } - } - // (Y - X) * (-(2**n)) -> (X - Y) * (2**n), for positive nonzero n // (Y + const) * (-(2**n)) -> (-constY) * (2**n), for positive nonzero n // The "* (2**n)" thus becomes a potential shifting opportunity. 
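The comment closing the hunk above explains why a multiply by a negative power of two is worth chasing: once the operands are rearranged, the "* (2**n)" becomes a shift. The test driving the rewrite is plain APInt arithmetic, so it can be checked in isolation; a worked example, illustrative only:

  #include "llvm/ADT/APInt.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  int main() {
    // A multiply by -8 matches the pattern: -8 is negative and
    // |-8| = 8 = 2^3, so (Y - X) * -8 can become (X - Y) << 3.
    APInt Val(32, static_cast<uint64_t>(-8), /*isSigned=*/true);
    APInt PosVal = Val.abs();
    if (Val.isNegative() && PosVal.isPowerOf2())
      outs() << "shift amount: " << PosVal.logBase2() << "\n"; // prints 3
    return 0;
  }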
@@ -174,10 +170,10 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { const APInt & Val = CI->getValue(); const APInt &PosVal = Val.abs(); if (Val.isNegative() && PosVal.isPowerOf2()) { - Value *X = 0, *Y = 0; + Value *X = nullptr, *Y = nullptr; if (Op0->hasOneUse()) { ConstantInt *C1; - Value *Sub = 0; + Value *Sub = nullptr; if (match(Op0, m_Sub(m_Value(Y), m_Value(X)))) Sub = Builder->CreateSub(X, Y, "suba"); else if (match(Op0, m_Add(m_Value(Y), m_ConstantInt(C1)))) @@ -201,6 +197,19 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { if (isa<PHINode>(Op0)) if (Instruction *NV = FoldOpIntoPhi(I)) return NV; + + // Canonicalize (X+C1)*CI -> X*CI+C1*CI. + { + Value *X; + Constant *C1; + if (match(Op0, m_OneUse(m_Add(m_Value(X), m_Constant(C1))))) { + Value *Mul = Builder->CreateMul(C1, Op1); + // Only go forward with the transform if C1*CI simplifies to a tidier + // constant. + if (!match(Mul, m_Mul(m_Value(), m_Value()))) + return BinaryOperator::CreateAdd(Builder->CreateMul(X, Op1), Mul); + } + } } if (Value *Op0v = dyn_castNegVal(Op0)) // -X * -Y = X*Y @@ -247,7 +256,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { } /// i1 mul -> i1 and. - if (I.getType()->isIntegerTy(1)) + if (I.getType()->getScalarType()->isIntegerTy(1)) return BinaryOperator::CreateAnd(Op0, Op1); // X*(1 << Y) --> X << Y @@ -267,7 +276,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { // -2 is "-1 << 1" so it is all bits set except the low one. APInt Negative2(I.getType()->getPrimitiveSizeInBits(), (uint64_t)-2, true); - Value *BoolCast = 0, *OtherOp = 0; + Value *BoolCast = nullptr, *OtherOp = nullptr; if (MaskedValueIsZero(Op0, Negative2)) BoolCast = Op0, OtherOp = Op1; else if (MaskedValueIsZero(Op1, Negative2)) @@ -280,7 +289,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { } } - return Changed ? &I : 0; + return Changed ? &I : nullptr; } // @@ -313,16 +322,41 @@ static void detectLog2OfHalf(Value *&Op, Value *&Y, IntrinsicInst *&Log2) { if (I->getOpcode() != Instruction::FMul || !I->hasUnsafeAlgebra()) return; - ConstantFP *CFP = dyn_cast<ConstantFP>(I->getOperand(0)); - if (CFP && CFP->isExactlyValue(0.5)) { + if (match(I->getOperand(0), m_SpecificFP(0.5))) Y = I->getOperand(1); - return; - } - CFP = dyn_cast<ConstantFP>(I->getOperand(1)); - if (CFP && CFP->isExactlyValue(0.5)) + else if (match(I->getOperand(1), m_SpecificFP(0.5))) Y = I->getOperand(0); } +static bool isFiniteNonZeroFp(Constant *C) { + if (C->getType()->isVectorTy()) { + for (unsigned I = 0, E = C->getType()->getVectorNumElements(); I != E; + ++I) { + ConstantFP *CFP = dyn_cast<ConstantFP>(C->getAggregateElement(I)); + if (!CFP || !CFP->getValueAPF().isFiniteNonZero()) + return false; + } + return true; + } + + return isa<ConstantFP>(C) && + cast<ConstantFP>(C)->getValueAPF().isFiniteNonZero(); +} + +static bool isNormalFp(Constant *C) { + if (C->getType()->isVectorTy()) { + for (unsigned I = 0, E = C->getType()->getVectorNumElements(); I != E; + ++I) { + ConstantFP *CFP = dyn_cast<ConstantFP>(C->getAggregateElement(I)); + if (!CFP || !CFP->getValueAPF().isNormal()) + return false; + } + return true; + } + + return isa<ConstantFP>(C) && cast<ConstantFP>(C)->getValueAPF().isNormal(); +} + /// Helper function of InstCombiner::visitFMul(BinaryOperator(). It returns /// true iff the given value is FMul or FDiv with one and only one operand /// being a normal constant (i.e. not Zero/NaN/Infinity). 
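The isFiniteNonZeroFp and isNormalFp helpers added above generalize two APFloat predicates from ConstantFP to arbitrary constants, walking vector elements one by one. Keeping both matters because the predicates are not equivalent: a value can be finite and non-zero yet denormal, and only isNormal rejects it. A small sketch of the underlying scalar predicates (assumes the APFloat::IEEEsingle spelling of this LLVM era):

  #include "llvm/ADT/APFloat.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  int main() {
    // The smallest single-precision value is a denormal.
    APFloat Tiny = APFloat::getSmallest(APFloat::IEEEsingle);
    outs() << "isFiniteNonZero: " << Tiny.isFiniteNonZero() << "\n"; // 1
    outs() << "isNormal:        " << Tiny.isNormal() << "\n";        // 0
    // An ordinary constant like 0.5 passes both predicates.
    APFloat Half(0.5f);
    outs() << "0.5 isNormal:    " << Half.isNormal() << "\n";        // 1
    return 0;
  }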
@@ -332,19 +366,13 @@ static bool isFMulOrFDivWithConstant(Value *V) { I->getOpcode() != Instruction::FDiv)) return false; - ConstantFP *C0 = dyn_cast<ConstantFP>(I->getOperand(0)); - ConstantFP *C1 = dyn_cast<ConstantFP>(I->getOperand(1)); + Constant *C0 = dyn_cast<Constant>(I->getOperand(0)); + Constant *C1 = dyn_cast<Constant>(I->getOperand(1)); if (C0 && C1) return false; - return (C0 && C0->getValueAPF().isFiniteNonZero()) || - (C1 && C1->getValueAPF().isFiniteNonZero()); -} - -static bool isNormalFp(const ConstantFP *C) { - const APFloat &Flt = C->getValueAPF(); - return Flt.isNormal(); + return (C0 && isFiniteNonZeroFp(C0)) || (C1 && isFiniteNonZeroFp(C1)); } /// foldFMulConst() is a helper routine of InstCombiner::visitFMul(). @@ -354,41 +382,41 @@ static bool isNormalFp(const ConstantFP *C) { /// resulting expression. Note that this function could return NULL in /// case the constants cannot be folded into a normal floating-point. /// -Value *InstCombiner::foldFMulConst(Instruction *FMulOrDiv, ConstantFP *C, +Value *InstCombiner::foldFMulConst(Instruction *FMulOrDiv, Constant *C, Instruction *InsertBefore) { assert(isFMulOrFDivWithConstant(FMulOrDiv) && "V is invalid"); Value *Opnd0 = FMulOrDiv->getOperand(0); Value *Opnd1 = FMulOrDiv->getOperand(1); - ConstantFP *C0 = dyn_cast<ConstantFP>(Opnd0); - ConstantFP *C1 = dyn_cast<ConstantFP>(Opnd1); + Constant *C0 = dyn_cast<Constant>(Opnd0); + Constant *C1 = dyn_cast<Constant>(Opnd1); - BinaryOperator *R = 0; + BinaryOperator *R = nullptr; // (X * C0) * C => X * (C0*C) if (FMulOrDiv->getOpcode() == Instruction::FMul) { Constant *F = ConstantExpr::getFMul(C1 ? C1 : C0, C); - if (isNormalFp(cast<ConstantFP>(F))) + if (isNormalFp(F)) R = BinaryOperator::CreateFMul(C1 ? Opnd0 : Opnd1, F); } else { if (C0) { // (C0 / X) * C => (C0 * C) / X if (FMulOrDiv->hasOneUse()) { // It would otherwise introduce another div. 
- ConstantFP *F = cast<ConstantFP>(ConstantExpr::getFMul(C0, C)); + Constant *F = ConstantExpr::getFMul(C0, C); if (isNormalFp(F)) R = BinaryOperator::CreateFDiv(F, Opnd1); } } else { // (X / C1) * C => X * (C/C1) if C/C1 is not a denormal - ConstantFP *F = cast<ConstantFP>(ConstantExpr::getFDiv(C, C1)); + Constant *F = ConstantExpr::getFDiv(C, C1); if (isNormalFp(F)) { R = BinaryOperator::CreateFMul(Opnd0, F); } else { // (X / C1) * C => X / (C1/C) Constant *F = ConstantExpr::getFDiv(C1, C); - if (isNormalFp(cast<ConstantFP>(F))) + if (isNormalFp(F)) R = BinaryOperator::CreateFDiv(Opnd0, F); } } @@ -406,10 +434,13 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + if (isa<Constant>(Op0)) std::swap(Op0, Op1); - if (Value *V = SimplifyFMulInst(Op0, Op1, I.getFastMathFlags(), TD)) + if (Value *V = SimplifyFMulInst(Op0, Op1, I.getFastMathFlags(), DL)) return ReplaceInstUsesWith(I, V); bool AllowReassociate = I.hasUnsafeAlgebra(); @@ -425,17 +456,23 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { if (Instruction *NV = FoldOpIntoPhi(I)) return NV; - ConstantFP *C = dyn_cast<ConstantFP>(Op1); - if (C && AllowReassociate && C->getValueAPF().isFiniteNonZero()) { + // (fmul X, -1.0) --> (fsub -0.0, X) + if (match(Op1, m_SpecificFP(-1.0))) { + Constant *NegZero = ConstantFP::getNegativeZero(Op1->getType()); + Instruction *RI = BinaryOperator::CreateFSub(NegZero, Op0); + RI->copyFastMathFlags(&I); + return RI; + } + + Constant *C = cast<Constant>(Op1); + if (AllowReassociate && isFiniteNonZeroFp(C)) { // Let MDC denote an expression in one of these forms: // X * C, C/X, X/C, where C is a constant. // // Try to simplify "MDC * Constant" - if (isFMulOrFDivWithConstant(Op0)) { - Value *V = foldFMulConst(cast<Instruction>(Op0), C, &I); - if (V) + if (isFMulOrFDivWithConstant(Op0)) + if (Value *V = foldFMulConst(cast<Instruction>(Op0), C, &I)) return ReplaceInstUsesWith(I, V); - } // (MDC +/- C1) * C => (MDC * C) +/- (C1 * C) Instruction *FAddSub = dyn_cast<Instruction>(Op0); @@ -444,8 +481,8 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { FAddSub->getOpcode() == Instruction::FSub)) { Value *Opnd0 = FAddSub->getOperand(0); Value *Opnd1 = FAddSub->getOperand(1); - ConstantFP *C0 = dyn_cast<ConstantFP>(Opnd0); - ConstantFP *C1 = dyn_cast<ConstantFP>(Opnd1); + Constant *C0 = dyn_cast<Constant>(Opnd0); + Constant *C1 = dyn_cast<Constant>(Opnd1); bool Swap = false; if (C0) { std::swap(C0, C1); @@ -453,12 +490,11 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { Swap = true; } - if (C1 && C1->getValueAPF().isFiniteNonZero() && - isFMulOrFDivWithConstant(Opnd0)) { + if (C1 && isFiniteNonZeroFp(C1) && isFMulOrFDivWithConstant(Opnd0)) { Value *M1 = ConstantExpr::getFMul(C1, C); - Value *M0 = isNormalFp(cast<ConstantFP>(M1)) ? + Value *M0 = isNormalFp(cast<Constant>(M1)) ? 
foldFMulConst(cast<Instruction>(Opnd0), C, &I) : - 0; + nullptr; if (M0 && M1) { if (Swap && FAddSub->getOpcode() == Instruction::FSub) std::swap(M0, M1); @@ -478,8 +514,8 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { // Under unsafe algebra do: // X * log2(0.5*Y) = X*log2(Y) - X if (I.hasUnsafeAlgebra()) { - Value *OpX = NULL; - Value *OpY = NULL; + Value *OpX = nullptr; + Value *OpY = nullptr; IntrinsicInst *Log2; detectLog2OfHalf(Op0, OpY, Log2); if (OpY) { @@ -515,8 +551,11 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { Value *N1 = dyn_castFNegVal(Opnd1, IgnoreZeroSign); // -X * -Y => X*Y - if (N1) - return BinaryOperator::CreateFMul(N0, N1); + if (N1) { + Value *FMul = Builder->CreateFMul(N0, N1); + FMul->takeName(&I); + return ReplaceInstUsesWith(I, FMul); + } if (Opnd0->hasOneUse()) { // -X * Y => -(X*Y) (Promote negation as high as possible) @@ -539,7 +578,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { Value *Opnd0_0, *Opnd0_1; if (Opnd0->hasOneUse() && match(Opnd0, m_FMul(m_Value(Opnd0_0), m_Value(Opnd0_1)))) { - Value *Y = 0; + Value *Y = nullptr; if (Opnd0_0 == Opnd1 && Opnd0_1 != Opnd1) Y = Opnd0_1; else if (Opnd0_1 == Opnd1 && Opnd0_0 != Opnd1) @@ -564,7 +603,8 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { if (!match(RHS, m_UIToFP(m_Value(C)))) std::swap(LHS, RHS); - if (match(RHS, m_UIToFP(m_Value(C))) && C->getType()->isIntegerTy(1)) { + if (match(RHS, m_UIToFP(m_Value(C))) && + C->getType()->getScalarType()->isIntegerTy(1)) { B = LHS; Value *Zero = ConstantFP::getNegativeZero(B->getType()); return SelectInst::Create(C, B, Zero); @@ -579,7 +619,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { std::swap(LHS, RHS); if (match(RHS, m_FSub(m_FPOne(), m_UIToFP(m_Value(C)))) && - C->getType()->isIntegerTy(1)) { + C->getType()->getScalarType()->isIntegerTy(1)) { A = LHS; Value *Zero = ConstantFP::getNegativeZero(A->getType()); return SelectInst::Create(C, Zero, A); @@ -592,7 +632,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { break; } - return Changed ? &I : 0; + return Changed ? &I : nullptr; } /// SimplifyDivRemOfSelect - Try to fold a divide or remainder of a select @@ -653,12 +693,12 @@ bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) { // If we past the instruction, quit looking for it. if (&*BBI == SI) - SI = 0; + SI = nullptr; if (&*BBI == SelectCond) - SelectCond = 0; + SelectCond = nullptr; // If we ran out of things to eliminate, break out of the loop. - if (SelectCond == 0 && SI == 0) + if (!SelectCond && !SI) break; } @@ -690,7 +730,7 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) { if (Instruction::BinaryOps(LHS->getOpcode()) == I.getOpcode()) if (ConstantInt *LHSRHS = dyn_cast<ConstantInt>(LHS->getOperand(1))) { if (MultiplyOverflows(RHS, LHSRHS, - I.getOpcode()==Instruction::SDiv)) + I.getOpcode() == Instruction::SDiv)) return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType())); return BinaryOperator::Create(I.getOpcode(), LHS->getOperand(0), ConstantExpr::getMul(RHS, LHSRHS)); @@ -706,12 +746,31 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) { } } + if (ConstantInt *One = dyn_cast<ConstantInt>(Op0)) { + if (One->isOne() && !I.getType()->isIntegerTy(1)) { + bool isSigned = I.getOpcode() == Instruction::SDiv; + if (isSigned) { + // If Op1 is 0 then it's undefined behaviour, if Op1 is 1 then the + // result is one, if Op1 is -1 then the result is minus one, otherwise + // it's zero. 
+ Value *Inc = Builder->CreateAdd(Op1, One); + Value *Cmp = Builder->CreateICmpULT( + Inc, ConstantInt::get(I.getType(), 3)); + return SelectInst::Create(Cmp, Op1, ConstantInt::get(I.getType(), 0)); + } else { + // If Op1 is 0 then it's undefined behaviour. If Op1 is 1 then the + // result is one, otherwise it's zero. + return new ZExtInst(Builder->CreateICmpEQ(Op1, One), I.getType()); + } + } + } + // See if we can fold away this div instruction. if (SimplifyDemandedInstructionBits(I)) return &I; // (X - (X rem Y)) / Y -> X / Y; usually originates as ((X / Y) * Y) / Y - Value *X = 0, *Z = 0; + Value *X = nullptr, *Z = nullptr; if (match(Op0, m_Sub(m_Value(X), m_Value(Z)))) { // (X - Z) / Y; Y = Op1 bool isSigned = I.getOpcode() == Instruction::SDiv; if ((isSigned && match(Z, m_SRem(m_Specific(X), m_Specific(Op1)))) || @@ -719,7 +778,7 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) { return BinaryOperator::Create(I.getOpcode(), X, Op1); } - return 0; + return nullptr; } /// dyn_castZExtVal - Checks if V is a zext or constant that can @@ -732,7 +791,7 @@ static Value *dyn_castZExtVal(Value *V, Type *Ty) { if (C->getValue().getActiveBits() <= cast<IntegerType>(Ty)->getBitWidth()) return ConstantExpr::getTrunc(C, Ty); } - return 0; + return nullptr; } namespace { @@ -757,7 +816,7 @@ struct UDivFoldAction { }; UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand) - : FoldAction(FA), OperandToFold(InputOperand), FoldResult(0) {} + : FoldAction(FA), OperandToFold(InputOperand), FoldResult(nullptr) {} UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand, size_t SLHS) : FoldAction(FA), OperandToFold(InputOperand), SelectLHSIdx(SLHS) {} }; @@ -836,7 +895,8 @@ static size_t visitUDivOperand(Value *Op0, Value *Op1, const BinaryOperator &I, if (SelectInst *SI = dyn_cast<SelectInst>(Op1)) if (size_t LHSIdx = visitUDivOperand(Op0, SI->getOperand(1), I, Actions)) if (visitUDivOperand(Op0, SI->getOperand(2), I, Actions)) { - Actions.push_back(UDivFoldAction((FoldUDivOperandCb)0, Op1, LHSIdx-1)); + Actions.push_back(UDivFoldAction((FoldUDivOperandCb)nullptr, Op1, + LHSIdx-1)); return Actions.size(); } @@ -846,7 +906,10 @@ static size_t visitUDivOperand(Value *Op0, Value *Op1, const BinaryOperator &I, Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - if (Value *V = SimplifyUDivInst(Op0, Op1, TD)) + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + + if (Value *V = SimplifyUDivInst(Op0, Op1, DL)) return ReplaceInstUsesWith(I, V); // Handle the integer div common cases @@ -854,13 +917,11 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { return Common; // (x lshr C1) udiv C2 --> x udiv (C2 << C1) - if (ConstantInt *C2 = dyn_cast<ConstantInt>(Op1)) { + if (Constant *C2 = dyn_cast<Constant>(Op1)) { Value *X; - ConstantInt *C1; - if (match(Op0, m_LShr(m_Value(X), m_ConstantInt(C1)))) { - APInt NC = C2->getValue().shl(C1->getLimitedValue(C1->getBitWidth()-1)); - return BinaryOperator::CreateUDiv(X, Builder->getInt(NC)); - } + Constant *C1; + if (match(Op0, m_LShr(m_Value(X), m_Constant(C1)))) + return BinaryOperator::CreateUDiv(X, ConstantExpr::getShl(C2, C1)); } // (zext A) udiv (zext B) --> zext (A udiv B) @@ -901,24 +962,27 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { return Inst; } - return 0; + return nullptr; } Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - if (Value *V = SimplifySDivInst(Op0, Op1, 
TD)) + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + + if (Value *V = SimplifySDivInst(Op0, Op1, DL)) return ReplaceInstUsesWith(I, V); // Handle the integer div common cases if (Instruction *Common = commonIDivTransforms(I)) return Common; - if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) { - // sdiv X, -1 == -X - if (RHS->isAllOnesValue()) - return BinaryOperator::CreateNeg(Op0); + // sdiv X, -1 == -X + if (match(Op1, m_AllOnes())) + return BinaryOperator::CreateNeg(Op0); + if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) { // sdiv X, C --> ashr exact X, log2(C) if (I.isExact() && RHS->getValue().isNonNegative() && RHS->getValue().isPowerOf2()) { @@ -926,6 +990,12 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { RHS->getValue().exactLogBase2()); return BinaryOperator::CreateExactAShr(Op0, ShAmt, I.getName()); } + } + + if (Constant *RHS = dyn_cast<Constant>(Op1)) { + // X/INT_MIN -> X == INT_MIN + if (RHS->isMinSignedValue()) + return new ZExtInst(Builder->CreateICmpEQ(Op0, Op1), I.getType()); // -X/C --> X/-C provided the negation doesn't overflow. if (SubOperator *Sub = dyn_cast<SubOperator>(Op0)) @@ -954,7 +1024,7 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { } } - return 0; + return nullptr; } /// CvtFDivConstToReciprocal tries to convert X/C into X*1/C if C not a special @@ -965,9 +1035,12 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { /// returned; otherwise, NULL is returned. /// static Instruction *CvtFDivConstToReciprocal(Value *Dividend, - ConstantFP *Divisor, + Constant *Divisor, bool AllowReciprocal) { - const APFloat &FpVal = Divisor->getValueAPF(); + if (!isa<ConstantFP>(Divisor)) // TODO: handle vectors. + return nullptr; + + const APFloat &FpVal = cast<ConstantFP>(Divisor)->getValueAPF(); APFloat Reciprocal(FpVal.getSemantics()); bool Cvt = FpVal.getExactInverse(&Reciprocal); @@ -978,7 +1051,7 @@ static Instruction *CvtFDivConstToReciprocal(Value *Dividend, } if (!Cvt) - return 0; + return nullptr; ConstantFP *R; R = ConstantFP::get(Dividend->getType()->getContext(), Reciprocal); @@ -988,7 +1061,10 @@ static Instruction *CvtFDivConstToReciprocal(Value *Dividend, Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - if (Value *V = SimplifyFDivInst(Op0, Op1, TD)) + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + + if (Value *V = SimplifyFDivInst(Op0, Op1, DL)) return ReplaceInstUsesWith(I, V); if (isa<Constant>(Op0)) @@ -999,32 +1075,29 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { bool AllowReassociate = I.hasUnsafeAlgebra(); bool AllowReciprocal = I.hasAllowReciprocal(); - if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) { + if (Constant *Op1C = dyn_cast<Constant>(Op1)) { if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) if (Instruction *R = FoldOpIntoSelect(I, SI)) return R; if (AllowReassociate) { - ConstantFP *C1 = 0; - ConstantFP *C2 = Op1C; + Constant *C1 = nullptr; + Constant *C2 = Op1C; Value *X; - Instruction *Res = 0; + Instruction *Res = nullptr; - if (match(Op0, m_FMul(m_Value(X), m_ConstantFP(C1)))) { + if (match(Op0, m_FMul(m_Value(X), m_Constant(C1)))) { // (X*C1)/C2 => X * (C1/C2) // Constant *C = ConstantExpr::getFDiv(C1, C2); - const APFloat &F = cast<ConstantFP>(C)->getValueAPF(); - if (F.isNormal()) + if (isNormalFp(C)) Res = BinaryOperator::CreateFMul(X, C); - } else if (match(Op0, m_FDiv(m_Value(X), m_ConstantFP(C1)))) { + } else if (match(Op0, m_FDiv(m_Value(X), m_Constant(C1)))) 
{ // (X/C1)/C2 => X /(C2*C1) [=> X * 1/(C2*C1) if reciprocal is allowed] // Constant *C = ConstantExpr::getFMul(C1, C2); - const APFloat &F = cast<ConstantFP>(C)->getValueAPF(); - if (F.isNormal()) { - Res = CvtFDivConstToReciprocal(X, cast<ConstantFP>(C), - AllowReciprocal); + if (isNormalFp(C)) { + Res = CvtFDivConstToReciprocal(X, C, AllowReciprocal); if (!Res) Res = BinaryOperator::CreateFDiv(X, C); } @@ -1037,60 +1110,68 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { } // X / C => X * 1/C - if (Instruction *T = CvtFDivConstToReciprocal(Op0, Op1C, AllowReciprocal)) + if (Instruction *T = CvtFDivConstToReciprocal(Op0, Op1C, AllowReciprocal)) { + T->copyFastMathFlags(&I); return T; + } - return 0; + return nullptr; } - if (AllowReassociate && isa<ConstantFP>(Op0)) { - ConstantFP *C1 = cast<ConstantFP>(Op0), *C2; - Constant *Fold = 0; + if (AllowReassociate && isa<Constant>(Op0)) { + Constant *C1 = cast<Constant>(Op0), *C2; + Constant *Fold = nullptr; Value *X; bool CreateDiv = true; // C1 / (X*C2) => (C1/C2) / X - if (match(Op1, m_FMul(m_Value(X), m_ConstantFP(C2)))) + if (match(Op1, m_FMul(m_Value(X), m_Constant(C2)))) Fold = ConstantExpr::getFDiv(C1, C2); - else if (match(Op1, m_FDiv(m_Value(X), m_ConstantFP(C2)))) { + else if (match(Op1, m_FDiv(m_Value(X), m_Constant(C2)))) { // C1 / (X/C2) => (C1*C2) / X Fold = ConstantExpr::getFMul(C1, C2); - } else if (match(Op1, m_FDiv(m_ConstantFP(C2), m_Value(X)))) { + } else if (match(Op1, m_FDiv(m_Constant(C2), m_Value(X)))) { // C1 / (C2/X) => (C1/C2) * X Fold = ConstantExpr::getFDiv(C1, C2); CreateDiv = false; } - if (Fold) { - const APFloat &FoldC = cast<ConstantFP>(Fold)->getValueAPF(); - if (FoldC.isNormal()) { - Instruction *R = CreateDiv ? - BinaryOperator::CreateFDiv(Fold, X) : - BinaryOperator::CreateFMul(X, Fold); - R->setFastMathFlags(I.getFastMathFlags()); - return R; - } + if (Fold && isNormalFp(Fold)) { + Instruction *R = CreateDiv ? 
BinaryOperator::CreateFDiv(Fold, X) + : BinaryOperator::CreateFMul(X, Fold); + R->setFastMathFlags(I.getFastMathFlags()); + return R; } - return 0; + return nullptr; } if (AllowReassociate) { Value *X, *Y; - Value *NewInst = 0; - Instruction *SimpR = 0; + Value *NewInst = nullptr; + Instruction *SimpR = nullptr; if (Op0->hasOneUse() && match(Op0, m_FDiv(m_Value(X), m_Value(Y)))) { // (X/Y) / Z => X / (Y*Z) // - if (!isa<ConstantFP>(Y) || !isa<ConstantFP>(Op1)) { + if (!isa<Constant>(Y) || !isa<Constant>(Op1)) { NewInst = Builder->CreateFMul(Y, Op1); + if (Instruction *RI = dyn_cast<Instruction>(NewInst)) { + FastMathFlags Flags = I.getFastMathFlags(); + Flags &= cast<Instruction>(Op0)->getFastMathFlags(); + RI->setFastMathFlags(Flags); + } SimpR = BinaryOperator::CreateFDiv(X, NewInst); } } else if (Op1->hasOneUse() && match(Op1, m_FDiv(m_Value(X), m_Value(Y)))) { // Z / (X/Y) => Z*Y / X // - if (!isa<ConstantFP>(Y) || !isa<ConstantFP>(Op0)) { + if (!isa<Constant>(Y) || !isa<Constant>(Op0)) { NewInst = Builder->CreateFMul(Op0, Y); + if (Instruction *RI = dyn_cast<Instruction>(NewInst)) { + FastMathFlags Flags = I.getFastMathFlags(); + Flags &= cast<Instruction>(Op1)->getFastMathFlags(); + RI->setFastMathFlags(Flags); + } SimpR = BinaryOperator::CreateFDiv(NewInst, X); } } @@ -1103,7 +1184,7 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { } } - return 0; + return nullptr; } /// This function implements the transforms common to both integer remainder @@ -1123,7 +1204,7 @@ Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) { if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I)) return &I; - if (isa<ConstantInt>(Op1)) { + if (isa<Constant>(Op1)) { if (Instruction *Op0I = dyn_cast<Instruction>(Op0)) { if (SelectInst *SI = dyn_cast<SelectInst>(Op0I)) { if (Instruction *R = FoldOpIntoSelect(I, SI)) @@ -1139,13 +1220,16 @@ Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) { } } - return 0; + return nullptr; } Instruction *InstCombiner::visitURem(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - if (Value *V = SimplifyURemInst(Op0, Op1, TD)) + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + + if (Value *V = SimplifyURemInst(Op0, Op1, DL)) return ReplaceInstUsesWith(I, V); if (Instruction *common = commonIRemTransforms(I)) @@ -1171,13 +1255,16 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) { return ReplaceInstUsesWith(I, Ext); } - return 0; + return nullptr; } Instruction *InstCombiner::visitSRem(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - if (Value *V = SimplifySRemInst(Op0, Op1, TD)) + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + + if (Value *V = SimplifySRemInst(Op0, Op1, DL)) return ReplaceInstUsesWith(I, V); // Handle the integer rem common cases @@ -1213,7 +1300,7 @@ Instruction *InstCombiner::visitSRem(BinaryOperator &I) { bool hasMissing = false; for (unsigned i = 0; i != VWidth; ++i) { Constant *Elt = C->getAggregateElement(i); - if (Elt == 0) { + if (!Elt) { hasMissing = true; break; } @@ -1242,18 +1329,21 @@ Instruction *InstCombiner::visitSRem(BinaryOperator &I) { } } - return 0; + return nullptr; } Instruction *InstCombiner::visitFRem(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - if (Value *V = SimplifyFRemInst(Op0, Op1, TD)) + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + + if (Value *V = SimplifyFRemInst(Op0, Op1, DL)) return ReplaceInstUsesWith(I, V); // 
Handle cases involving: rem X, (select Cond, Y, Z) if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I)) return &I; - return 0; + return nullptr; } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 4c6d0c4..46f7b8a 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -18,6 +18,8 @@ #include "llvm/IR/DataLayout.h" using namespace llvm; +#define DEBUG_TYPE "instcombine" + /// FoldPHIArgBinOpIntoPHI - If we have something like phi [add (a,b), add(a,c)] /// and if a/b/c and the add's all have a single use, turn this into a phi /// and a single binop. @@ -48,12 +50,12 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) { // types. I->getOperand(0)->getType() != LHSType || I->getOperand(1)->getType() != RHSType) - return 0; + return nullptr; // If they are CmpInst instructions, check their predicates if (CmpInst *CI = dyn_cast<CmpInst>(I)) if (CI->getPredicate() != cast<CmpInst>(FirstInst)->getPredicate()) - return 0; + return nullptr; if (isNUW) isNUW = cast<OverflowingBinaryOperator>(I)->hasNoUnsignedWrap(); @@ -63,8 +65,8 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) { isExact = cast<PossiblyExactOperator>(I)->isExact(); // Keep track of which operand needs a phi node. - if (I->getOperand(0) != LHSVal) LHSVal = 0; - if (I->getOperand(1) != RHSVal) RHSVal = 0; + if (I->getOperand(0) != LHSVal) LHSVal = nullptr; + if (I->getOperand(1) != RHSVal) RHSVal = nullptr; } // If both LHS and RHS would need a PHI, don't do this transformation, @@ -72,14 +74,14 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) { // which leads to higher register pressure. This is especially // bad when the PHIs are in the header of a loop. if (!LHSVal && !RHSVal) - return 0; + return nullptr; // Otherwise, this is safe to transform! Value *InLHS = FirstInst->getOperand(0); Value *InRHS = FirstInst->getOperand(1); - PHINode *NewLHS = 0, *NewRHS = 0; - if (LHSVal == 0) { + PHINode *NewLHS = nullptr, *NewRHS = nullptr; + if (!LHSVal) { NewLHS = PHINode::Create(LHSType, PN.getNumIncomingValues(), FirstInst->getOperand(0)->getName() + ".pn"); NewLHS->addIncoming(InLHS, PN.getIncomingBlock(0)); @@ -87,7 +89,7 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) { LHSVal = NewLHS; } - if (RHSVal == 0) { + if (!RHSVal) { NewRHS = PHINode::Create(RHSType, PN.getNumIncomingValues(), FirstInst->getOperand(1)->getName() + ".pn"); NewRHS->addIncoming(InRHS, PN.getIncomingBlock(0)); @@ -148,7 +150,7 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { GetElementPtrInst *GEP= dyn_cast<GetElementPtrInst>(PN.getIncomingValue(i)); if (!GEP || !GEP->hasOneUse() || GEP->getType() != FirstInst->getType() || GEP->getNumOperands() != FirstInst->getNumOperands()) - return 0; + return nullptr; AllInBounds &= GEP->isInBounds(); @@ -170,19 +172,19 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { // for struct indices, which must always be constant. 
if (isa<ConstantInt>(FirstInst->getOperand(op)) || isa<ConstantInt>(GEP->getOperand(op))) - return 0; + return nullptr; if (FirstInst->getOperand(op)->getType() !=GEP->getOperand(op)->getType()) - return 0; + return nullptr; // If we already needed a PHI for an earlier operand, and another operand // also requires a PHI, we'd be introducing more PHIs than we're // eliminating, which increases register pressure on entry to the PHI's // block. if (NeededPhi) - return 0; + return nullptr; - FixedOperands[op] = 0; // Needs a PHI. + FixedOperands[op] = nullptr; // Needs a PHI. NeededPhi = true; } } @@ -194,7 +196,7 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { // load up into the predecessors so that we have a load of a gep of an alloca, // which can usually all be folded into the load. if (AllBasePointersAreAllocas) - return 0; + return nullptr; // Otherwise, this is safe to transform. Insert PHI nodes for each operand // that is variable. @@ -255,9 +257,7 @@ static bool isSafeAndProfitableToSinkLoad(LoadInst *L) { // profitable to do this xform. if (AllocaInst *AI = dyn_cast<AllocaInst>(L->getOperand(0))) { bool isAddressTaken = false; - for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); - UI != E; ++UI) { - User *U = *UI; + for (User *U : AI->users()) { if (isa<LoadInst>(U)) continue; if (StoreInst *SI = dyn_cast<StoreInst>(U)) { // If storing TO the alloca, then the address isn't taken. @@ -290,7 +290,7 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { // FIXME: This is overconservative; this transform is allowed in some cases // for atomic operations. if (FirstLI->isAtomic()) - return 0; + return nullptr; // When processing loads, we need to propagate two bits of information to the // sunk load: whether it is volatile, and what its alignment is. We currently @@ -305,20 +305,20 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { // load and the PHI. if (FirstLI->getParent() != PN.getIncomingBlock(0) || !isSafeAndProfitableToSinkLoad(FirstLI)) - return 0; + return nullptr; // If the PHI is of volatile loads and the load block has multiple // successors, sinking it would remove a load of the volatile value from // the path through the other successor. if (isVolatile && FirstLI->getParent()->getTerminator()->getNumSuccessors() != 1) - return 0; + return nullptr; // Check to see if all arguments are the same operation. for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { LoadInst *LI = dyn_cast<LoadInst>(PN.getIncomingValue(i)); if (!LI || !LI->hasOneUse()) - return 0; + return nullptr; // We can't sink the load if the loaded value could be modified between // the load and the PHI. @@ -326,12 +326,12 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { LI->getParent() != PN.getIncomingBlock(i) || LI->getPointerAddressSpace() != LoadAddrSpace || !isSafeAndProfitableToSinkLoad(LI)) - return 0; + return nullptr; // If some of the loads have an alignment specified but not all of them, // we can't do the transformation. if ((LoadAlignment != 0) != (LI->getAlignment() != 0)) - return 0; + return nullptr; LoadAlignment = std::min(LoadAlignment, LI->getAlignment()); @@ -340,7 +340,7 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { // the path through the other successor. if (isVolatile && LI->getParent()->getTerminator()->getNumSuccessors() != 1) - return 0; + return nullptr; } // Okay, they are all the same operation. 
Create a new PHI node of the @@ -356,7 +356,7 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { Value *NewInVal = cast<LoadInst>(PN.getIncomingValue(i))->getOperand(0); if (NewInVal != InVal) - InVal = 0; + InVal = nullptr; NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i)); } @@ -400,8 +400,8 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { // If all input operands to the phi are the same instruction (e.g. a cast from // the same type or "+42") we can pull the operation through the PHI, reducing // code size and simplifying code. - Constant *ConstantOp = 0; - Type *CastSrcTy = 0; + Constant *ConstantOp = nullptr; + Type *CastSrcTy = nullptr; bool isNUW = false, isNSW = false, isExact = false; if (isa<CastInst>(FirstInst)) { @@ -411,13 +411,13 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { // the code by turning an i32 into an i1293. if (PN.getType()->isIntegerTy() && CastSrcTy->isIntegerTy()) { if (!ShouldChangeType(PN.getType(), CastSrcTy)) - return 0; + return nullptr; } } else if (isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst)) { // Can fold binop, compare or shift here if the RHS is a constant, // otherwise call FoldPHIArgBinOpIntoPHI. ConstantOp = dyn_cast<Constant>(FirstInst->getOperand(1)); - if (ConstantOp == 0) + if (!ConstantOp) return FoldPHIArgBinOpIntoPHI(PN); if (OverflowingBinaryOperator *BO = @@ -428,19 +428,19 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { dyn_cast<PossiblyExactOperator>(FirstInst)) isExact = PEO->isExact(); } else { - return 0; // Cannot fold this operation. + return nullptr; // Cannot fold this operation. } // Check to see if all arguments are the same operation. for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i)); - if (I == 0 || !I->hasOneUse() || !I->isSameOperationAs(FirstInst)) - return 0; + if (!I || !I->hasOneUse() || !I->isSameOperationAs(FirstInst)) + return nullptr; if (CastSrcTy) { if (I->getOperand(0)->getType() != CastSrcTy) - return 0; // Cast operation must match. + return nullptr; // Cast operation must match. } else if (I->getOperand(1) != ConstantOp) { - return 0; + return nullptr; } if (isNUW) @@ -464,7 +464,7 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { Value *NewInVal = cast<Instruction>(PN.getIncomingValue(i))->getOperand(0); if (NewInVal != InVal) - InVal = 0; + InVal = nullptr; NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i)); } @@ -518,7 +518,7 @@ static bool DeadPHICycle(PHINode *PN, if (PotentiallyDeadPHIs.size() == 16) return false; - if (PHINode *PU = dyn_cast<PHINode>(PN->use_back())) + if (PHINode *PU = dyn_cast<PHINode>(PN->user_back())) return DeadPHICycle(PU, PotentiallyDeadPHIs); return false; @@ -589,10 +589,10 @@ namespace llvm { template<> struct DenseMapInfo<LoweredPHIRecord> { static inline LoweredPHIRecord getEmptyKey() { - return LoweredPHIRecord(0, 0); + return LoweredPHIRecord(nullptr, 0); } static inline LoweredPHIRecord getTombstoneKey() { - return LoweredPHIRecord(0, 1); + return LoweredPHIRecord(nullptr, 1); } static unsigned getHashValue(const LoweredPHIRecord &Val) { return DenseMapInfo<PHINode*>::getHashValue(Val.PN) ^ (Val.Shift>>3) ^ @@ -639,42 +639,40 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { // bail out. 
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { InvokeInst *II = dyn_cast<InvokeInst>(PN->getIncomingValue(i)); - if (II == 0) continue; + if (!II) continue; if (II->getParent() != PN->getIncomingBlock(i)) continue; // If we have a phi, and if it's directly in the predecessor, then we have // a critical edge where we need to put the truncate. Since we can't // split the edge in instcombine, we have to bail out. - return 0; + return nullptr; } - - for (Value::use_iterator UI = PN->use_begin(), E = PN->use_end(); - UI != E; ++UI) { - Instruction *User = cast<Instruction>(*UI); + for (User *U : PN->users()) { + Instruction *UserI = cast<Instruction>(U); // If the user is a PHI, inspect its uses recursively. - if (PHINode *UserPN = dyn_cast<PHINode>(User)) { + if (PHINode *UserPN = dyn_cast<PHINode>(UserI)) { if (PHIsInspected.insert(UserPN)) PHIsToSlice.push_back(UserPN); continue; } // Truncates are always ok. - if (isa<TruncInst>(User)) { - PHIUsers.push_back(PHIUsageRecord(PHIId, 0, User)); + if (isa<TruncInst>(UserI)) { + PHIUsers.push_back(PHIUsageRecord(PHIId, 0, UserI)); continue; } // Otherwise it must be a lshr which can only be used by one trunc. - if (User->getOpcode() != Instruction::LShr || - !User->hasOneUse() || !isa<TruncInst>(User->use_back()) || - !isa<ConstantInt>(User->getOperand(1))) - return 0; + if (UserI->getOpcode() != Instruction::LShr || + !UserI->hasOneUse() || !isa<TruncInst>(UserI->user_back()) || + !isa<ConstantInt>(UserI->getOperand(1))) + return nullptr; - unsigned Shift = cast<ConstantInt>(User->getOperand(1))->getZExtValue(); - PHIUsers.push_back(PHIUsageRecord(PHIId, Shift, User->use_back())); + unsigned Shift = cast<ConstantInt>(UserI->getOperand(1))->getZExtValue(); + PHIUsers.push_back(PHIUsageRecord(PHIId, Shift, UserI->user_back())); } } @@ -709,7 +707,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { // If we've already lowered a user like this, reuse the previously lowered // value. - if ((EltPHI = ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)]) == 0) { + if ((EltPHI = ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)]) == nullptr) { // Otherwise, Create the new PHI node for this user. EltPHI = PHINode::Create(Ty, PN->getNumIncomingValues(), @@ -790,7 +788,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { // PHINode simplification // Instruction *InstCombiner::visitPHINode(PHINode &PN) { - if (Value *V = SimplifyInstruction(&PN, TD, TLI)) + if (Value *V = SimplifyInstruction(&PN, DL, TLI)) return ReplaceInstUsesWith(PN, V); // If all PHI operands are the same operation, pull them through the PHI, @@ -809,7 +807,7 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) { // this PHI only has a single use (a PHI), and if that PHI only has one use (a // PHI)... break the cycle. if (PN.hasOneUse()) { - Instruction *PHIUser = cast<Instruction>(PN.use_back()); + Instruction *PHIUser = cast<Instruction>(PN.user_back()); if (PHINode *PU = dyn_cast<PHINode>(PHIUser)) { SmallPtrSet<PHINode*, 16> PotentiallyDeadPHIs; PotentiallyDeadPHIs.insert(&PN); @@ -825,7 +823,7 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) { // late. if (PHIUser->hasOneUse() && (isa<BinaryOperator>(PHIUser) || isa<GetElementPtrInst>(PHIUser)) && - PHIUser->use_back() == &PN) { + PHIUser->user_back() == &PN) { return ReplaceInstUsesWith(PN, UndefValue::get(PN.getType())); } } @@ -893,10 +891,10 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) { // it is only used by trunc or trunc(lshr) operations. 
If so, we split the // PHI into the various pieces being extracted. This sort of thing is // introduced when SROA promotes an aggregate to a single large integer type. - if (PN.getType()->isIntegerTy() && TD && - !TD->isLegalInteger(PN.getType()->getPrimitiveSizeInBits())) + if (PN.getType()->isIntegerTy() && DL && + !DL->isLegalInteger(PN.getType()->getPrimitiveSizeInBits())) if (Instruction *Res = SliceUpIllegalIntegerPHI(PN)) return Res; - return 0; + return nullptr; } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 283bec2..a36cbe6 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -14,28 +14,35 @@ #include "InstCombine.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Support/PatternMatch.h" +#include "llvm/IR/PatternMatch.h" using namespace llvm; using namespace PatternMatch; +#define DEBUG_TYPE "instcombine" + /// MatchSelectPattern - Pattern match integer [SU]MIN, [SU]MAX, and ABS idioms, /// returning the kind and providing the out parameter results if we /// successfully match. static SelectPatternFlavor MatchSelectPattern(Value *V, Value *&LHS, Value *&RHS) { SelectInst *SI = dyn_cast<SelectInst>(V); - if (SI == 0) return SPF_UNKNOWN; + if (!SI) return SPF_UNKNOWN; ICmpInst *ICI = dyn_cast<ICmpInst>(SI->getCondition()); - if (ICI == 0) return SPF_UNKNOWN; + if (!ICI) return SPF_UNKNOWN; + + ICmpInst::Predicate Pred = ICI->getPredicate(); + Value *CmpLHS = ICI->getOperand(0); + Value *CmpRHS = ICI->getOperand(1); + Value *TrueVal = SI->getTrueValue(); + Value *FalseVal = SI->getFalseValue(); - LHS = ICI->getOperand(0); - RHS = ICI->getOperand(1); + LHS = CmpLHS; + RHS = CmpRHS; // (icmp X, Y) ? X : Y - if (SI->getTrueValue() == ICI->getOperand(0) && - SI->getFalseValue() == ICI->getOperand(1)) { - switch (ICI->getPredicate()) { + if (TrueVal == CmpLHS && FalseVal == CmpRHS) { + switch (Pred) { default: return SPF_UNKNOWN; // Equality. case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_UGE: return SPF_UMAX; @@ -49,18 +56,35 @@ MatchSelectPattern(Value *V, Value *&LHS, Value *&RHS) { } // (icmp X, Y) ? Y : X - if (SI->getTrueValue() == ICI->getOperand(1) && - SI->getFalseValue() == ICI->getOperand(0)) { - switch (ICI->getPredicate()) { - default: return SPF_UNKNOWN; // Equality. - case ICmpInst::ICMP_UGT: - case ICmpInst::ICMP_UGE: return SPF_UMIN; - case ICmpInst::ICMP_SGT: - case ICmpInst::ICMP_SGE: return SPF_SMIN; - case ICmpInst::ICMP_ULT: - case ICmpInst::ICMP_ULE: return SPF_UMAX; - case ICmpInst::ICMP_SLT: - case ICmpInst::ICMP_SLE: return SPF_SMAX; + if (TrueVal == CmpRHS && FalseVal == CmpLHS) { + switch (Pred) { + default: return SPF_UNKNOWN; // Equality. + case ICmpInst::ICMP_UGT: + case ICmpInst::ICMP_UGE: return SPF_UMIN; + case ICmpInst::ICMP_SGT: + case ICmpInst::ICMP_SGE: return SPF_SMIN; + case ICmpInst::ICMP_ULT: + case ICmpInst::ICMP_ULE: return SPF_UMAX; + case ICmpInst::ICMP_SLT: + case ICmpInst::ICMP_SLE: return SPF_SMAX; + } + } + + if (ConstantInt *C1 = dyn_cast<ConstantInt>(CmpRHS)) { + if ((CmpLHS == TrueVal && match(FalseVal, m_Neg(m_Specific(CmpLHS)))) || + (CmpLHS == FalseVal && match(TrueVal, m_Neg(m_Specific(CmpLHS))))) { + + // ABS(X) ==> (X >s 0) ? X : -X and (X >s -1) ? X : -X + // NABS(X) ==> (X >s 0) ? -X : X and (X >s -1) ? 
-X : X + if (Pred == ICmpInst::ICMP_SGT && (C1->isZero() || C1->isMinusOne())) { + return (CmpLHS == TrueVal) ? SPF_ABS : SPF_NABS; + } + + // ABS(X) ==> (X <s 0) ? -X : X and (X <s 1) ? -X : X + // NABS(X) ==> (X <s 0) ? X : -X and (X <s 1) ? X : -X + if (Pred == ICmpInst::ICMP_SLT && (C1->isZero() || C1->isOne())) { + return (CmpLHS == FalseVal) ? SPF_ABS : SPF_NABS; + } } } @@ -129,15 +153,15 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI, if (TI->isCast()) { Type *FIOpndTy = FI->getOperand(0)->getType(); if (TI->getOperand(0)->getType() != FIOpndTy) - return 0; + return nullptr; // The select condition may be a vector. We may only change the operand // type if the vector width remains the same (and matches the condition). Type *CondTy = SI.getCondition()->getType(); if (CondTy->isVectorTy() && (!FIOpndTy->isVectorTy() || CondTy->getVectorNumElements() != FIOpndTy->getVectorNumElements())) - return 0; + return nullptr; } else { - return 0; // unknown unary op. + return nullptr; // unknown unary op. } // Fold this by inserting a select from the input values. @@ -149,7 +173,7 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI, // Only handle binary operators here. if (!isa<BinaryOperator>(TI)) - return 0; + return nullptr; // Figure out if the operations have any operands in common. Value *MatchOp, *OtherOpT, *OtherOpF; @@ -165,7 +189,7 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI, OtherOpF = FI->getOperand(0); MatchIsOpZero = false; } else if (!TI->isCommutative()) { - return 0; + return nullptr; } else if (TI->getOperand(0) == FI->getOperand(1)) { MatchOp = TI->getOperand(0); OtherOpT = TI->getOperand(1); @@ -177,7 +201,7 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI, OtherOpF = FI->getOperand(1); MatchIsOpZero = true; } else { - return 0; + return nullptr; } // If we reach here, they do have operations in common. @@ -282,7 +306,7 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal, } } - return 0; + return nullptr; } /// SimplifyWithOpReplaced - See if V simplifies when its operand Op is @@ -296,7 +320,7 @@ static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, Instruction *I = dyn_cast<Instruction>(V); if (!I) - return 0; + return nullptr; // If this is a binary operator, try to simplify it with the replaced op. if (BinaryOperator *B = dyn_cast<BinaryOperator>(I)) { @@ -347,7 +371,7 @@ static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, } } - return 0; + return nullptr; } /// foldSelectICmpAndOr - We want to turn: @@ -368,18 +392,18 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal, InstCombiner::BuilderTy *Builder) { const ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition()); if (!IC || !IC->isEquality() || !SI.getType()->isIntegerTy()) - return 0; + return nullptr; Value *CmpLHS = IC->getOperand(0); Value *CmpRHS = IC->getOperand(1); if (!match(CmpRHS, m_Zero())) - return 0; + return nullptr; Value *X; const APInt *C1; if (!match(CmpLHS, m_And(m_Value(X), m_Power2(C1)))) - return 0; + return nullptr; const APInt *C2; bool OrOnTrueVal = false; @@ -388,7 +412,7 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal, OrOnTrueVal = match(TrueVal, m_Or(m_Specific(FalseVal), m_Power2(C2))); if (!OrOnFalseVal && !OrOnTrueVal) - return 0; + return nullptr; Value *V = CmpLHS; Value *Y = OrOnFalseVal ? 
TrueVal : FalseVal; @@ -527,7 +551,7 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, if (IntegerType *Ty = dyn_cast<IntegerType>(CmpLHS->getType())) { if (TrueVal->getType() == Ty) { if (ConstantInt *Cmp = dyn_cast<ConstantInt>(CmpRHS)) { - ConstantInt *C1 = NULL, *C2 = NULL; + ConstantInt *C1 = nullptr, *C2 = nullptr; if (Pred == ICmpInst::ICMP_SGT && Cmp->isAllOnesValue()) { C1 = dyn_cast<ConstantInt>(TrueVal); C2 = dyn_cast<ConstantInt>(FalseVal); @@ -554,18 +578,18 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, // arms of the select. See if substituting this value into the arm and // simplifying the result yields the same value as the other arm. if (Pred == ICmpInst::ICMP_EQ) { - if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, TD, TLI) == TrueVal || - SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, TD, TLI) == TrueVal) + if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, DL, TLI) == TrueVal || + SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, DL, TLI) == TrueVal) return ReplaceInstUsesWith(SI, FalseVal); - if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, TD, TLI) == FalseVal || - SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, TD, TLI) == FalseVal) + if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, DL, TLI) == FalseVal || + SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, DL, TLI) == FalseVal) return ReplaceInstUsesWith(SI, FalseVal); } else if (Pred == ICmpInst::ICMP_NE) { - if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, TD, TLI) == FalseVal || - SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, TD, TLI) == FalseVal) + if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, DL, TLI) == FalseVal || + SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, DL, TLI) == FalseVal) return ReplaceInstUsesWith(SI, TrueVal); - if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, TD, TLI) == TrueVal || - SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, TD, TLI) == TrueVal) + if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, DL, TLI) == TrueVal || + SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, DL, TLI) == TrueVal) return ReplaceInstUsesWith(SI, TrueVal); } @@ -586,7 +610,7 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, if (Value *V = foldSelectICmpAndOr(SI, TrueVal, FalseVal, Builder)) return ReplaceInstUsesWith(SI, V); - return Changed ? &SI : 0; + return Changed ? &SI : nullptr; } @@ -606,7 +630,7 @@ static bool CanSelectOperandBeMappingIntoPredBlock(const Value *V, // If the value is a non-instruction value like a constant or argument, it // can always be mapped. const Instruction *I = dyn_cast<Instruction>(V); - if (I == 0) return true; + if (!I) return true; // If V is a PHI node defined in the same block as the condition PHI, we can // map the arguments. 
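The new SPF_ABS/SPF_NABS cases added to MatchSelectPattern above recognize four equivalent spellings of integer absolute value (and, mirrored, its negation). A standalone C++ check of those select idioms, illustrative only:

    #include <cassert>
    #include <cstdlib>

    int main() {
      for (int x = -100; x <= 100; ++x) {
        int abs1 = x > 0 ? x : -x;    // (X >s 0)  ? X : -X
        int abs2 = x > -1 ? x : -x;   // (X >s -1) ? X : -X
        int abs3 = x < 0 ? -x : x;    // (X <s 0)  ? -X : X
        int abs4 = x < 1 ? -x : x;    // (X <s 1)  ? -X : X
        assert(abs1 == std::abs(x) && abs2 == std::abs(x));
        assert(abs3 == std::abs(x) && abs4 == std::abs(x));
      }
    }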
@@ -649,10 +673,50 @@ Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner, return ReplaceInstUsesWith(Outer, C); } - // TODO: MIN(MIN(A, 23), 97) - return 0; -} + if (SPF1 == SPF2) { + if (ConstantInt *CB = dyn_cast<ConstantInt>(B)) { + if (ConstantInt *CC = dyn_cast<ConstantInt>(C)) { + APInt ACB = CB->getValue(); + APInt ACC = CC->getValue(); + + // MIN(MIN(A, 23), 97) -> MIN(A, 23) + // MAX(MAX(A, 97), 23) -> MAX(A, 97) + if ((SPF1 == SPF_UMIN && ACB.ule(ACC)) || + (SPF1 == SPF_SMIN && ACB.sle(ACC)) || + (SPF1 == SPF_UMAX && ACB.uge(ACC)) || + (SPF1 == SPF_SMAX && ACB.sge(ACC))) + return ReplaceInstUsesWith(Outer, Inner); + + // MIN(MIN(A, 97), 23) -> MIN(A, 23) + // MAX(MAX(A, 23), 97) -> MAX(A, 97) + if ((SPF1 == SPF_UMIN && ACB.ugt(ACC)) || + (SPF1 == SPF_SMIN && ACB.sgt(ACC)) || + (SPF1 == SPF_UMAX && ACB.ult(ACC)) || + (SPF1 == SPF_SMAX && ACB.slt(ACC))) { + Outer.replaceUsesOfWith(Inner, A); + return &Outer; + } + } + } + } + + // ABS(ABS(X)) -> ABS(X) + // NABS(NABS(X)) -> NABS(X) + if (SPF1 == SPF2 && (SPF1 == SPF_ABS || SPF1 == SPF_NABS)) { + return ReplaceInstUsesWith(Outer, Inner); + } + // ABS(NABS(X)) -> ABS(X) + // NABS(ABS(X)) -> NABS(X) + if ((SPF1 == SPF_ABS && SPF2 == SPF_NABS) || + (SPF1 == SPF_NABS && SPF2 == SPF_ABS)) { + SelectInst *SI = cast<SelectInst>(Inner); + Value *NewSI = Builder->CreateSelect( + SI->getCondition(), SI->getFalseValue(), SI->getTrueValue()); + return ReplaceInstUsesWith(Outer, NewSI); + } + return nullptr; +} /// foldSelectICmpAnd - If one of the constants is zero (we know they can't /// both be) and we have an icmp instruction with zero, and we have an 'and' @@ -663,27 +727,27 @@ static Value *foldSelectICmpAnd(const SelectInst &SI, ConstantInt *TrueVal, InstCombiner::BuilderTy *Builder) { const ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition()); if (!IC || !IC->isEquality() || !SI.getType()->isIntegerTy()) - return 0; + return nullptr; if (!match(IC->getOperand(1), m_Zero())) - return 0; + return nullptr; ConstantInt *AndRHS; Value *LHS = IC->getOperand(0); if (!match(LHS, m_And(m_Value(), m_ConstantInt(AndRHS)))) - return 0; + return nullptr; // If both select arms are non-zero see if we have a select of the form // 'x ? 2^n + C : C'. Then we can offset both arms by C, use the logic // for 'x ? 2^n : 0' and fix the thing up at the end. - ConstantInt *Offset = 0; + ConstantInt *Offset = nullptr; if (!TrueVal->isZero() && !FalseVal->isZero()) { if ((TrueVal->getValue() - FalseVal->getValue()).isPowerOf2()) Offset = FalseVal; else if ((FalseVal->getValue() - TrueVal->getValue()).isPowerOf2()) Offset = TrueVal; else - return 0; + return nullptr; // Adjust TrueVal and FalseVal to the offset. TrueVal = ConstantInt::get(Builder->getContext(), @@ -696,7 +760,7 @@ static Value *foldSelectICmpAnd(const SelectInst &SI, ConstantInt *TrueVal, if (!AndRHS->getValue().isPowerOf2() || (!TrueVal->getValue().isPowerOf2() && !FalseVal->getValue().isPowerOf2())) - return 0; + return nullptr; // Determine which shift is needed to transform result of the 'and' into the // desired result. @@ -708,7 +772,7 @@ static Value *foldSelectICmpAnd(const SelectInst &SI, ConstantInt *TrueVal, // or a trunc of the 'and'. The trunc case requires that all of the truncated // bits are zero, we can figure that out by looking at the 'and' mask. 
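The FoldSPFofSPF additions above fold nested min/max operations with constant bounds: the looser of the two constants is redundant. A standalone check of the underlying arithmetic, with plain std::min/std::max standing in for the matched select patterns:

    #include <algorithm>
    #include <cassert>

    int main() {
      for (int a = -200; a <= 200; ++a) {
        // MIN(MIN(A, 23), 97) -> MIN(A, 23): the outer bound is implied.
        assert(std::min(std::min(a, 23), 97) == std::min(a, 23));
        // MAX(MAX(A, 97), 23) -> MAX(A, 97): likewise for max.
        assert(std::max(std::max(a, 97), 23) == std::max(a, 97));
        // Opposite ordering keeps only the tighter outer bound.
        assert(std::min(std::min(a, 97), 23) == std::min(a, 23));
      }
    }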
if (AndZeros >= ValC->getBitWidth()) - return 0; + return nullptr; Value *V = Builder->CreateZExtOrTrunc(LHS, SI.getType()); if (ValZeros > AndZeros) @@ -734,7 +798,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { Value *TrueVal = SI.getTrueValue(); Value *FalseVal = SI.getFalseValue(); - if (Value *V = SimplifySelectInst(CondVal, TrueVal, FalseVal, TD)) + if (Value *V = SimplifySelectInst(CondVal, TrueVal, FalseVal, DL)) return ReplaceInstUsesWith(SI, V); if (SI.getType()->isIntegerTy(1)) { @@ -866,7 +930,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { if (Instruction *TI = dyn_cast<Instruction>(TrueVal)) if (Instruction *FI = dyn_cast<Instruction>(FalseVal)) if (TI->hasOneUse() && FI->hasOneUse()) { - Instruction *AddOp = 0, *SubOp = 0; + Instruction *AddOp = nullptr, *SubOp = nullptr; // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z)) if (TI->getOpcode() == FI->getOpcode()) @@ -888,7 +952,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { } if (AddOp) { - Value *OtherAddOp = 0; + Value *OtherAddOp = nullptr; if (SubOp->getOperand(0) == AddOp->getOperand(0)) { OtherAddOp = AddOp->getOperand(1); } else if (SubOp->getOperand(0) == AddOp->getOperand(1)) { @@ -901,6 +965,11 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { Value *NegVal; // Compute -Z if (SI.getType()->isFPOrFPVectorTy()) { NegVal = Builder->CreateFNeg(SubOp->getOperand(1)); + if (Instruction *NegInst = dyn_cast<Instruction>(NegVal)) { + FastMathFlags Flags = AddOp->getFastMathFlags(); + Flags &= SubOp->getFastMathFlags(); + NegInst->setFastMathFlags(Flags); + } } else { NegVal = Builder->CreateNeg(SubOp->getOperand(1)); } @@ -913,9 +982,15 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { Builder->CreateSelect(CondVal, NewTrueOp, NewFalseOp, SI.getName() + ".p"); - if (SI.getType()->isFPOrFPVectorTy()) - return BinaryOperator::CreateFAdd(SubOp->getOperand(0), NewSel); - else + if (SI.getType()->isFPOrFPVectorTy()) { + Instruction *RI = + BinaryOperator::CreateFAdd(SubOp->getOperand(0), NewSel); + + FastMathFlags Flags = AddOp->getFastMathFlags(); + Flags &= SubOp->getFastMathFlags(); + RI->setFastMathFlags(Flags); + return RI; + } else return BinaryOperator::CreateAdd(SubOp->getOperand(0), NewSel); } } @@ -944,7 +1019,6 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { // TODO. // ABS(-X) -> ABS(X) - // ABS(ABS(X)) -> ABS(X) } // See if we can fold the select into a phi node if the condition is a select. 
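On the fast-math-flag handling added above: when the FAdd and FSub arms are merged through the select, the new instruction can only promise what both inputs promised, so the flag sets are intersected. A minimal sketch of that policy using a made-up flag bitmask (not the LLVM FastMathFlags API):

    #include <cassert>
    #include <cstdint>

    enum FMF : uint8_t { NoNaNs = 1, NoInfs = 2, NoSignedZeros = 4 };

    static uint8_t mergeFlags(uint8_t addFlags, uint8_t subFlags) {
      return addFlags & subFlags; // keep only guarantees common to both
    }

    int main() {
      uint8_t add = NoNaNs | NoInfs;
      uint8_t sub = NoNaNs | NoSignedZeros;
      assert(mergeFlags(add, sub) == NoNaNs);
    }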
@@ -958,7 +1032,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { if (SelectInst *TrueSI = dyn_cast<SelectInst>(TrueVal)) { if (TrueSI->getCondition() == CondVal) { if (SI.getTrueValue() == TrueSI->getTrueValue()) - return 0; + return nullptr; SI.setOperand(1, TrueSI->getTrueValue()); return &SI; } @@ -966,7 +1040,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { if (SelectInst *FalseSI = dyn_cast<SelectInst>(FalseVal)) { if (FalseSI->getCondition() == CondVal) { if (SI.getFalseValue() == FalseSI->getFalseValue()) - return 0; + return nullptr; SI.setOperand(2, FalseSI->getFalseValue()); return &SI; } @@ -994,5 +1068,5 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { } } - return 0; + return nullptr; } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp index 8cf76e5..2f91c20 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -15,10 +15,12 @@ #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/Support/PatternMatch.h" +#include "llvm/IR/PatternMatch.h" using namespace llvm; using namespace PatternMatch; +#define DEBUG_TYPE "instcombine" + Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { assert(I.getOperand(1)->getType() == I.getOperand(0)->getType()); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); @@ -33,7 +35,7 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { if (Instruction *R = FoldOpIntoSelect(I, SI)) return R; - if (ConstantInt *CUI = dyn_cast<ConstantInt>(Op1)) + if (Constant *CUI = dyn_cast<Constant>(Op1)) if (Instruction *Res = FoldShiftByConstant(Op0, CUI, I)) return Res; @@ -50,7 +52,7 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { return &I; } - return 0; + return nullptr; } /// CanEvaluateShifted - See if we can compute the specified value, but shifted @@ -78,7 +80,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift, // if the needed bits are already zero in the input. This allows us to reuse // the value which means that we don't care if the shift has multiple uses. // TODO: Handle opposite shift by exact value. - ConstantInt *CI = 0; + ConstantInt *CI = nullptr; if ((isLeftShift && match(I, m_LShr(m_Value(), m_ConstantInt(CI)))) || (!isLeftShift && match(I, m_Shl(m_Value(), m_ConstantInt(CI))))) { if (CI->getZExtValue() == NumBits) { @@ -115,7 +117,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift, case Instruction::Shl: { // We can often fold the shift into shifts-by-a-constant. CI = dyn_cast<ConstantInt>(I->getOperand(1)); - if (CI == 0) return false; + if (!CI) return false; // We can always fold shl(c1)+shl(c2) -> shl(c1+c2). if (isLeftShift) return true; @@ -139,7 +141,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift, case Instruction::LShr: { // We can often fold the shift into shifts-by-a-constant. CI = dyn_cast<ConstantInt>(I->getOperand(1)); - if (CI == 0) return false; + if (!CI) return false; // We can always fold lshr(c1)+lshr(c2) -> lshr(c1+c2). 
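The comment above states the identity CanEvaluateShifted builds on; a standalone C++ spot-check for like-direction shift pairs, valid while the combined amount stays below the bit width:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t x = 0xDEADBEEF;
      for (unsigned c1 = 0; c1 < 16; ++c1)
        for (unsigned c2 = 0; c1 + c2 < 32; ++c2) {
          assert(((x << c1) << c2) == (x << (c1 + c2)));
          assert(((x >> c1) >> c2) == (x >> (c1 + c2)));
        }
    }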
if (!isLeftShift) return true; @@ -309,37 +311,38 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, -Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, +Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1, BinaryOperator &I) { bool isLeftShift = I.getOpcode() == Instruction::Shl; + ConstantInt *COp1 = nullptr; + if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(Op1)) + COp1 = dyn_cast_or_null<ConstantInt>(CV->getSplatValue()); + else if (ConstantVector *CV = dyn_cast<ConstantVector>(Op1)) + COp1 = dyn_cast_or_null<ConstantInt>(CV->getSplatValue()); + else + COp1 = dyn_cast<ConstantInt>(Op1); + + if (!COp1) + return nullptr; // See if we can propagate this shift into the input, this covers the trivial // cast of lshr(shl(x,c1),c2) as well as other more complex cases. if (I.getOpcode() != Instruction::AShr && - CanEvaluateShifted(Op0, Op1->getZExtValue(), isLeftShift, *this)) { + CanEvaluateShifted(Op0, COp1->getZExtValue(), isLeftShift, *this)) { DEBUG(dbgs() << "ICE: GetShiftedValue propagating shift through expression" " to eliminate shift:\n IN: " << *Op0 << "\n SH: " << I <<"\n"); return ReplaceInstUsesWith(I, - GetShiftedValue(Op0, Op1->getZExtValue(), isLeftShift, *this)); + GetShiftedValue(Op0, COp1->getZExtValue(), isLeftShift, *this)); } - // See if we can simplify any instructions used by the instruction whose sole // purpose is to compute bits we don't care about. uint32_t TypeBits = Op0->getType()->getScalarSizeInBits(); - // shl i32 X, 32 = 0 and srl i8 Y, 9 = 0, ... just don't eliminate - // a signed shift. - // - if (Op1->uge(TypeBits)) { - if (I.getOpcode() != Instruction::AShr) - return ReplaceInstUsesWith(I, Constant::getNullValue(Op0->getType())); - // ashr i32 X, 32 --> ashr i32 X, 31 - I.setOperand(1, ConstantInt::get(I.getType(), TypeBits-1)); - return &I; - } + assert(!COp1->uge(TypeBits) && + "Shift over the type width should have been removed already"); // ((X*C1) << C2) == (X * (C1 << C2)) if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op0)) @@ -367,7 +370,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, if (TrOp && I.isLogicalShift() && TrOp->isShift() && isa<ConstantInt>(TrOp->getOperand(1))) { // Okay, we'll do this xform. Make the shift of shift. - Constant *ShAmt = ConstantExpr::getZExt(Op1, TrOp->getType()); + Constant *ShAmt = ConstantExpr::getZExt(COp1, TrOp->getType()); // (shift2 (shift1 & 0x00FF), c2) Value *NSh = Builder->CreateBinOp(I.getOpcode(), TrOp, ShAmt,I.getName()); @@ -384,10 +387,10 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, // shift. We know that it is a logical shift by a constant, so adjust the // mask as appropriate. 
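The mask adjustment described above, in plain terms: shifting a masked value equals shifting the value and the mask together, for either logical direction. A small illustrative check:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t x = 0x12345678, m = 0x00FF00FF;
      for (unsigned c = 0; c < 32; ++c) {
        assert(((x & m) << c) == ((x << c) & (m << c)));
        assert(((x & m) >> c) == ((x >> c) & (m >> c)));
      }
    }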
if (I.getOpcode() == Instruction::Shl) - MaskV <<= Op1->getZExtValue(); + MaskV <<= COp1->getZExtValue(); else { assert(I.getOpcode() == Instruction::LShr && "Unknown logical shift"); - MaskV = MaskV.lshr(Op1->getZExtValue()); + MaskV = MaskV.lshr(COp1->getZExtValue()); } // shift1 & 0x00FF @@ -421,9 +424,13 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, // (X + (Y << C)) Value *X = Builder->CreateBinOp(Op0BO->getOpcode(), YS, V1, Op0BO->getOperand(1)->getName()); - uint32_t Op1Val = Op1->getLimitedValue(TypeBits); - return BinaryOperator::CreateAnd(X, ConstantInt::get(I.getContext(), - APInt::getHighBitsSet(TypeBits, TypeBits-Op1Val))); + uint32_t Op1Val = COp1->getLimitedValue(TypeBits); + + APInt Bits = APInt::getHighBitsSet(TypeBits, TypeBits - Op1Val); + Constant *Mask = ConstantInt::get(I.getContext(), Bits); + if (VectorType *VT = dyn_cast<VectorType>(X->getType())) + Mask = ConstantVector::getSplat(VT->getNumElements(), Mask); + return BinaryOperator::CreateAnd(X, Mask); } // Turn (Y + ((X >> C) & CC)) << C -> ((X & (CC << C)) + (Y << C)) @@ -453,9 +460,13 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, // (X + (Y << C)) Value *X = Builder->CreateBinOp(Op0BO->getOpcode(), V1, YS, Op0BO->getOperand(0)->getName()); - uint32_t Op1Val = Op1->getLimitedValue(TypeBits); - return BinaryOperator::CreateAnd(X, ConstantInt::get(I.getContext(), - APInt::getHighBitsSet(TypeBits, TypeBits-Op1Val))); + uint32_t Op1Val = COp1->getLimitedValue(TypeBits); + + APInt Bits = APInt::getHighBitsSet(TypeBits, TypeBits - Op1Val); + Constant *Mask = ConstantInt::get(I.getContext(), Bits); + if (VectorType *VT = dyn_cast<VectorType>(X->getType())) + Mask = ConstantVector::getSplat(VT->getNumElements(), Mask); + return BinaryOperator::CreateAnd(X, Mask); } // Turn (((X >> C)&CC) + Y) << C -> (X + (Y << C)) & (CC << C) @@ -523,7 +534,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, // Find out if this is a shift of a shift by a constant. BinaryOperator *ShiftOp = dyn_cast<BinaryOperator>(Op0); if (ShiftOp && !ShiftOp->isShift()) - ShiftOp = 0; + ShiftOp = nullptr; if (ShiftOp && isa<ConstantInt>(ShiftOp->getOperand(1))) { @@ -541,9 +552,9 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, ConstantInt *ShiftAmt1C = cast<ConstantInt>(ShiftOp->getOperand(1)); uint32_t ShiftAmt1 = ShiftAmt1C->getLimitedValue(TypeBits); - uint32_t ShiftAmt2 = Op1->getLimitedValue(TypeBits); + uint32_t ShiftAmt2 = COp1->getLimitedValue(TypeBits); assert(ShiftAmt2 != 0 && "Should have been simplified earlier"); - if (ShiftAmt1 == 0) return 0; // Will be simplified in the future. + if (ShiftAmt1 == 0) return nullptr; // Will be simplified in the future. 
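The hunks above generalize the AND mask to vectors by splatting one scalar mask across the lanes; the scalar identity being exploited is that ((x >> c) + y) << c equals (x + (y << c)) with the low c bits cleared. An illustrative check under unsigned wraparound arithmetic:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned c = 1; c < 31; ++c) {
        uint32_t x = 0x9E3779B9u, y = 0x7F4A7C15u; // arbitrary samples
        uint32_t highMask = ~0u << c;              // APInt::getHighBitsSet analogue
        assert((((x >> c) + y) << c) == ((x + (y << c)) & highMask));
      }
    }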
Value *X = ShiftOp->getOperand(0); IntegerType *Ty = cast<IntegerType>(I.getType()); @@ -671,13 +682,16 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, } } } - return 0; + return nullptr; } Instruction *InstCombiner::visitShl(BinaryOperator &I) { + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + if (Value *V = SimplifyShlInst(I.getOperand(0), I.getOperand(1), I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), - TD)) + DL)) return ReplaceInstUsesWith(I, V); if (Instruction *V = commonShiftTransforms(I)) @@ -709,12 +723,15 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) { match(I.getOperand(1), m_Constant(C2))) return BinaryOperator::CreateShl(ConstantExpr::getShl(C1, C2), A); - return 0; + return nullptr; } Instruction *InstCombiner::visitLShr(BinaryOperator &I) { + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + if (Value *V = SimplifyLShrInst(I.getOperand(0), I.getOperand(1), - I.isExact(), TD)) + I.isExact(), DL)) return ReplaceInstUsesWith(I, V); if (Instruction *R = commonShiftTransforms(I)) @@ -749,12 +766,15 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) { } } - return 0; + return nullptr; } Instruction *InstCombiner::visitAShr(BinaryOperator &I) { + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), - I.isExact(), TD)) + I.isExact(), DL)) return ReplaceInstUsesWith(I, V); if (Instruction *R = commonShiftTransforms(I)) @@ -769,11 +789,6 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) { // have a sign-extend idiom. Value *X; if (match(Op0, m_Shl(m_Value(X), m_Specific(Op1)))) { - // If the left shift is just shifting out partial signbits, delete the - // extension. - if (cast<OverflowingBinaryOperator>(Op0)->hasNoSignedWrap()) - return ReplaceInstUsesWith(I, X); - // If the input is an extension from the shifted amount value, e.g. // %x = zext i8 %A to i32 // %y = shl i32 %x, 24 @@ -800,11 +815,5 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) { APInt::getSignBit(I.getType()->getScalarSizeInBits()))) return BinaryOperator::CreateLShr(Op0, Op1); - // Arithmetic shifting an all-sign-bit value is a no-op. - unsigned NumSignBits = ComputeNumSignBits(Op0); - if (NumSignBits == Op0->getType()->getScalarSizeInBits()) - return ReplaceInstUsesWith(I, Op0); - - return 0; + return nullptr; } - diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index c831ddd..1b42d3d 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -12,15 +12,16 @@ // //===----------------------------------------------------------------------===// - #include "InstCombine.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/Support/PatternMatch.h" +#include "llvm/IR/PatternMatch.h" using namespace llvm; using namespace llvm::PatternMatch; +#define DEBUG_TYPE "instcombine" + /// ShrinkDemandedConstant - Check to see if the specified operand of the /// specified instruction is a constant integer. If so, check to see if there /// are any bits set in the constant that are not demanded. 
If so, shrink the @@ -57,7 +58,7 @@ bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) { Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask, KnownZero, KnownOne, 0); - if (V == 0) return false; + if (!V) return false; if (V == &Inst) return true; ReplaceInstUsesWith(Inst, V); return true; @@ -71,7 +72,7 @@ bool InstCombiner::SimplifyDemandedBits(Use &U, APInt DemandedMask, unsigned Depth) { Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, KnownZero, KnownOne, Depth); - if (NewVal == 0) return false; + if (!NewVal) return false; U = NewVal; return true; } @@ -101,13 +102,13 @@ bool InstCombiner::SimplifyDemandedBits(Use &U, APInt DemandedMask, Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, APInt &KnownZero, APInt &KnownOne, unsigned Depth) { - assert(V != 0 && "Null pointer of Value???"); + assert(V != nullptr && "Null pointer of Value???"); assert(Depth <= 6 && "Limit Search Depth"); uint32_t BitWidth = DemandedMask.getBitWidth(); Type *VTy = V->getType(); - assert((TD || !VTy->isPointerTy()) && + assert((DL || !VTy->isPointerTy()) && "SimplifyDemandedBits needs to know bit widths!"); - assert((!TD || TD->getTypeSizeInBits(VTy->getScalarType()) == BitWidth) && + assert((!DL || DL->getTypeSizeInBits(VTy->getScalarType()) == BitWidth) && (!VTy->isIntOrIntVectorTy() || VTy->getScalarSizeInBits() == BitWidth) && KnownZero.getBitWidth() == BitWidth && @@ -118,33 +119,33 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // We know all of the bits for a constant! KnownOne = CI->getValue() & DemandedMask; KnownZero = ~KnownOne & DemandedMask; - return 0; + return nullptr; } if (isa<ConstantPointerNull>(V)) { // We know all of the bits for a constant! KnownOne.clearAllBits(); KnownZero = DemandedMask; - return 0; + return nullptr; } KnownZero.clearAllBits(); KnownOne.clearAllBits(); if (DemandedMask == 0) { // Not demanding any bits from V. if (isa<UndefValue>(V)) - return 0; + return nullptr; return UndefValue::get(VTy); } if (Depth == 6) // Limit search depth. - return 0; + return nullptr; APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0); APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); Instruction *I = dyn_cast<Instruction>(V); if (!I) { - ComputeMaskedBits(V, KnownZero, KnownOne, Depth); - return 0; // Only analyze instructions. + computeKnownBits(V, KnownZero, KnownOne, Depth); + return nullptr; // Only analyze instructions. } // If there are multiple uses of this value and we aren't at the root, then @@ -157,8 +158,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // this instruction has a simpler value in that context. if (I->getOpcode() == Instruction::And) { // If either the LHS or the RHS are Zero, the result is zero. - ComputeMaskedBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1); - ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); + computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1); + computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); // If all of the demanded bits are known 1 on one side, return the other. // These bits cannot contribute to the result of the 'and' in this @@ -179,8 +180,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // only bits from X or Y are demanded. // If either the LHS or the RHS are One, the result is One. 
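The multiple-use shortcut above for 'and': if every demanded bit is known to be 1 in one operand, the AND cannot affect the demanded bits of the other operand, so the other operand can be returned directly in this context. A concrete standalone check (the masks and samples are arbitrary examples):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t demanded = 0x000000FF;
      uint32_t rhs = 0x0F0F00FF;        // low byte: all ones
      assert((demanded & ~rhs) == 0);   // demanded bits are known 1 in rhs
      for (uint32_t x : {0u, 0x12u, 0xABu, 0xFFFFFFFFu})
        assert(((x & rhs) & demanded) == (x & demanded));
    }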
- ComputeMaskedBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1); - ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); + computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1); + computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); // If all of the demanded bits are known zero on one side, return the // other. These bits cannot contribute to the result of the 'or' in this @@ -204,8 +205,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // We can simplify (X^Y) -> X or Y in the user's context if we know that // only bits from X or Y are demanded. - ComputeMaskedBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1); - ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); + computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1); + computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); // If all of the demanded bits are known zero on one side, return the // other. @@ -216,8 +217,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, } // Compute the KnownZero/KnownOne bits to simplify things downstream. - ComputeMaskedBits(I, KnownZero, KnownOne, Depth); - return 0; + computeKnownBits(I, KnownZero, KnownOne, Depth); + return nullptr; } // If this is the root being simplified, allow it to have multiple uses, @@ -229,7 +230,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, switch (I->getOpcode()) { default: - ComputeMaskedBits(I, KnownZero, KnownOne, Depth); + computeKnownBits(I, KnownZero, KnownOne, Depth); break; case Instruction::And: // If either the LHS or the RHS are Zero, the result is zero. @@ -409,20 +410,20 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, } case Instruction::BitCast: if (!I->getOperand(0)->getType()->isIntOrIntVectorTy()) - return 0; // vector->int or fp->int? + return nullptr; // vector->int or fp->int? if (VectorType *DstVTy = dyn_cast<VectorType>(I->getType())) { if (VectorType *SrcVTy = dyn_cast<VectorType>(I->getOperand(0)->getType())) { if (DstVTy->getNumElements() != SrcVTy->getNumElements()) // Don't touch a bitcast between vectors of different element counts. - return 0; + return nullptr; } else // Don't touch a scalar-to-vector bitcast. - return 0; + return nullptr; } else if (I->getOperand(0)->getType()->isVectorTy()) // Don't touch a vector-to-scalar bitcast. - return 0; + return nullptr; if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, KnownZero, KnownOne, Depth+1)) @@ -578,9 +579,9 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, return I; } - // Otherwise just hand the sub off to ComputeMaskedBits to fill in + // Otherwise just hand the sub off to computeKnownBits to fill in // the known zeros and ones. - ComputeMaskedBits(V, KnownZero, KnownOne, Depth); + computeKnownBits(V, KnownZero, KnownOne, Depth); // Turn this into a xor if LHS is 2^n-1 and the remaining bits are known // zero. @@ -751,7 +752,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // remainder is zero. if (DemandedMask.isNegative() && KnownZero.isNonNegative()) { APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0); - ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); + computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); // If it's known zero, our sign bit is also zero. 
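The srem reasoning above rests on a simple fact of truncating division: the remainder takes the sign of the dividend, so a dividend with a known-zero sign bit yields a remainder with a known-zero sign bit. A quick standalone check:

    #include <cassert>

    int main() {
      for (int num = 0; num <= 100; ++num)
        for (int den : {-7, -3, 2, 5})
          assert(num % den >= 0); // non-negative dividend, non-negative rem
    }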
if (LHSKnownZero.isNegative()) KnownZero.setBit(KnownZero.getBitWidth() - 1); @@ -810,10 +811,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, } case Intrinsic::x86_sse42_crc32_64_64: KnownZero = APInt::getHighBitsSet(64, 32); - return 0; + return nullptr; } } - ComputeMaskedBits(V, KnownZero, KnownOne, Depth); + computeKnownBits(V, KnownZero, KnownOne, Depth); break; } @@ -821,7 +822,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // constant. if ((DemandedMask & (KnownZero|KnownOne)) == DemandedMask) return Constant::getIntegerValue(VTy, KnownOne); - return 0; + return nullptr; } /// Helper routine of SimplifyDemandedUseBits. It tries to simplify @@ -847,13 +848,13 @@ Value *InstCombiner::SimplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShlOp1 = cast<ConstantInt>(Shl->getOperand(1))->getValue(); const APInt &ShrOp1 = cast<ConstantInt>(Shr->getOperand(1))->getValue(); if (!ShlOp1 || !ShrOp1) - return 0; // Noop. + return nullptr; // Noop. Value *VarX = Shr->getOperand(0); Type *Ty = VarX->getType(); unsigned BitWidth = Ty->getIntegerBitWidth(); if (ShlOp1.uge(BitWidth) || ShrOp1.uge(BitWidth)) - return 0; // Undef. + return nullptr; // Undef. unsigned ShlAmt = ShlOp1.getZExtValue(); unsigned ShrAmt = ShrOp1.getZExtValue(); @@ -882,7 +883,7 @@ Value *InstCombiner::SimplifyShrShlDemandedBits(Instruction *Shr, return VarX; if (!Shr->hasOneUse()) - return 0; + return nullptr; BinaryOperator *New; if (ShrAmt < ShlAmt) { @@ -902,7 +903,7 @@ Value *InstCombiner::SimplifyShrShlDemandedBits(Instruction *Shr, return InsertNewInstWith(New, *Shl); } - return 0; + return nullptr; } /// SimplifyDemandedVectorElts - The specified value produces a vector with @@ -923,7 +924,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, if (isa<UndefValue>(V)) { // If the entire vector is undefined, just return this info. UndefElts = EltMask; - return 0; + return nullptr; } if (DemandedElts == 0) { // If nothing is demanded, provide undef. @@ -938,7 +939,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, // Check if this is identity. If so, return 0 since we are not simplifying // anything. if (DemandedElts.isAllOnesValue()) - return 0; + return nullptr; Type *EltTy = cast<VectorType>(V->getType())->getElementType(); Constant *Undef = UndefValue::get(EltTy); @@ -952,7 +953,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, } Constant *Elt = C->getAggregateElement(i); - if (Elt == 0) return 0; + if (!Elt) return nullptr; if (isa<UndefValue>(Elt)) { // Already undef. Elts.push_back(Undef); @@ -964,12 +965,12 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, // If we changed the constant, return it. Constant *NewCV = ConstantVector::get(Elts); - return NewCV != C ? NewCV : 0; + return NewCV != C ? NewCV : nullptr; } // Limit search depth. if (Depth == 10) - return 0; + return nullptr; // If multiple users are using the root value, proceed with // simplification conservatively assuming that all elements @@ -980,14 +981,14 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, // the main instcombine process. if (Depth != 0) // TODO: Just compute the UndefElts information recursively. - return 0; + return nullptr; // Conservatively assume that all elements are needed. DemandedElts = EltMask; } Instruction *I = dyn_cast<Instruction>(V); - if (!I) return 0; // Only analyze instructions. 
+ if (!I) return nullptr; // Only analyze instructions. bool MadeChange = false; APInt UndefElts2(VWidth, 0); @@ -999,7 +1000,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, // If this is a variable index, we don't know which element it overwrites. // demand exactly the same input as we produce. ConstantInt *Idx = dyn_cast<ConstantInt>(I->getOperand(2)); - if (Idx == 0) { + if (!Idx) { // Note that we can't propagate undef elt info, because we don't know // which elt is getting updated. TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts, @@ -1281,5 +1282,5 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, break; } } - return MadeChange ? I : 0; + return MadeChange ? I : nullptr; } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 30290ee..cb16584 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -13,10 +13,12 @@ //===----------------------------------------------------------------------===// #include "InstCombine.h" -#include "llvm/Support/PatternMatch.h" +#include "llvm/IR/PatternMatch.h" using namespace llvm; using namespace PatternMatch; +#define DEBUG_TYPE "instcombine" + /// CheapToScalarize - Return true if the value is cheaper to scalarize than it /// is to leave as a vector operation. isConstant indicates whether we're /// extracting one known element. If false we're extracting a variable index. @@ -73,7 +75,7 @@ static Value *FindScalarElement(Value *V, unsigned EltNo) { if (InsertElementInst *III = dyn_cast<InsertElementInst>(V)) { // If this is an insert to a variable element, we don't know what it is. if (!isa<ConstantInt>(III->getOperand(2))) - return 0; + return nullptr; unsigned IIElt = cast<ConstantInt>(III->getOperand(2))->getZExtValue(); // If this is an insert to the element we are looking for, return the @@ -97,14 +99,14 @@ static Value *FindScalarElement(Value *V, unsigned EltNo) { } // Extract a value from a vector add operation with a constant zero. - Value *Val = 0; Constant *Con = 0; + Value *Val = nullptr; Constant *Con = nullptr; if (match(V, m_Add(m_Value(Val), m_Constant(Con)))) { if (Con->getAggregateElement(EltNo)->isNullValue()) return FindScalarElement(Val, EltNo); } // Otherwise, we don't know. - return 0; + return nullptr; } // If we have a PHI node with a vector type that has only 2 uses: feed @@ -113,12 +115,12 @@ static Value *FindScalarElement(Value *V, unsigned EltNo) { Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { // Verify that the PHI node has exactly 2 uses. Otherwise return NULL. if (!PN->hasNUses(2)) - return NULL; + return nullptr; // If so, it's known at this point that one operand is PHI and the other is // an extractelement node. Find the PHI user that is not the extractelement // node. - Value::use_iterator iu = PN->use_begin(); + auto iu = PN->user_begin(); Instruction *PHIUser = dyn_cast<Instruction>(*iu); if (PHIUser == cast<Instruction>(&EI)) PHIUser = cast<Instruction>(*(++iu)); @@ -126,9 +128,9 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { // Verify that this PHI user has one use, which is the PHI itself, // and that it is a binary operation which is cheap to scalarize. // otherwise return NULL. 
- if (!PHIUser->hasOneUse() || !(PHIUser->use_back() == PN) || + if (!PHIUser->hasOneUse() || !(PHIUser->user_back() == PN) || !(isa<BinaryOperator>(PHIUser)) || !CheapToScalarize(PHIUser, true)) - return NULL; + return nullptr; // Create a scalar PHI node that will replace the vector PHI node // just before the current PHI node. @@ -142,7 +144,7 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { // If the operand is the PHI induction variable: if (PHIInVal == PHIUser) { // Scalarize the binary operation. Its first operand is the - // scalar PHI and the second operand is extracted from the other + // scalar PHI, and the second operand is extracted from the other // vector operand. BinaryOperator *B0 = cast<BinaryOperator>(PHIUser); unsigned opId = (B0->getOperand(0) == PN) ? 1 : 0; @@ -318,7 +320,7 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { } } } - return 0; + return nullptr; } /// CollectSingleShuffleElements - If V is a shuffle of values that ONLY returns @@ -326,7 +328,7 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { /// Otherwise, return false. static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, SmallVectorImpl<Constant*> &Mask) { - assert(V->getType() == LHS->getType() && V->getType() == RHS->getType() && + assert(LHS->getType() == RHS->getType() && "Invalid CollectSingleShuffleElements"); unsigned NumElts = V->getType()->getVectorNumElements(); @@ -359,7 +361,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue(); if (isa<UndefValue>(ScalarOp)) { // inserting undef into vector. - // Okay, we can handle this if the vector we are insertinting into is + // We can handle this if the vector we are inserting into is // transitively ok. if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { // If so, update the mask to reflect the inserted undef. @@ -367,14 +369,14 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, return true; } } else if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)){ - if (isa<ConstantInt>(EI->getOperand(1)) && - EI->getOperand(0)->getType() == V->getType()) { + if (isa<ConstantInt>(EI->getOperand(1))) { unsigned ExtractedIdx = cast<ConstantInt>(EI->getOperand(1))->getZExtValue(); + unsigned NumLHSElts = LHS->getType()->getVectorNumElements(); // This must be extracting from either LHS or RHS. if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) { - // Okay, we can handle this if the vector we are insertinting into is + // We can handle this if the vector we are inserting into is // transitively ok. if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { // If so, update the mask to reflect the inserted value. @@ -386,7 +388,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, assert(EI->getOperand(0) == RHS); Mask[InsertedIdx % NumElts] = ConstantInt::get(Type::getInt32Ty(V->getContext()), - ExtractedIdx+NumElts); + ExtractedIdx + NumLHSElts); } return true; } @@ -394,29 +396,36 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, } } } - // TODO: Handle shufflevector here! return false; } -/// CollectShuffleElements - We are building a shuffle of V, using RHS as the -/// RHS of the shuffle instruction, if it is not null. Return a shuffle mask -/// that computes V and the LHS value of the shuffle. 
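The mask updates above (ExtractedIdx + NumLHSElts) follow the shufflevector convention: entry i selects lane Mask[i] from the concatenation of the two sources, so RHS lanes are offset by the LHS element count rather than by the result width. A self-contained model of that indexing (the template harness is illustrative):

    #include <array>
    #include <cassert>
    #include <cstddef>

    template <std::size_t N, std::size_t M>
    std::array<int, M> shuffle(const std::array<int, N> &lhs,
                               const std::array<int, N> &rhs,
                               const std::array<int, M> &mask) {
      std::array<int, M> out{};
      for (std::size_t i = 0; i < M; ++i)
        out[i] = mask[i] < static_cast<int>(N) ? lhs[mask[i]]
                                               : rhs[mask[i] - N];
      return out;
    }

    int main() {
      std::array<int, 4> a{10, 11, 12, 13}, b{20, 21, 22, 23};
      // insertelement(a, extractelement(b, 1), 3) expressed as a shuffle:
      std::array<int, 4> mask{0, 1, 2, 5}; // lane 3 reads rhs[5 - 4]
      assert((shuffle(a, b, mask) == std::array<int, 4>{10, 11, 12, 21}));
    }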
-static Value *CollectShuffleElements(Value *V, SmallVectorImpl<Constant*> &Mask, - Value *&RHS) { - assert(V->getType()->isVectorTy() && - (RHS == 0 || V->getType() == RHS->getType()) && - "Invalid shuffle!"); + +/// We are building a shuffle to create V, which is a sequence of insertelement, +/// extractelement pairs. If PermittedRHS is set, then we must either use it or +/// not rely on the second vector source. Return a std::pair containing the +/// left and right vectors of the proposed shuffle (or 0), and set the Mask +/// parameter as required. +/// +/// Note: we intentionally don't try to fold earlier shuffles since they have +/// often been chosen carefully to be efficiently implementable on the target. +typedef std::pair<Value *, Value *> ShuffleOps; + +static ShuffleOps CollectShuffleElements(Value *V, + SmallVectorImpl<Constant *> &Mask, + Value *PermittedRHS) { + assert(V->getType()->isVectorTy() && "Invalid shuffle!"); unsigned NumElts = cast<VectorType>(V->getType())->getNumElements(); if (isa<UndefValue>(V)) { Mask.assign(NumElts, UndefValue::get(Type::getInt32Ty(V->getContext()))); - return V; + return std::make_pair( + PermittedRHS ? UndefValue::get(PermittedRHS->getType()) : V, nullptr); } if (isa<ConstantAggregateZero>(V)) { Mask.assign(NumElts, ConstantInt::get(Type::getInt32Ty(V->getContext()),0)); - return V; + return std::make_pair(V, nullptr); } if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) { @@ -426,51 +435,94 @@ static Value *CollectShuffleElements(Value *V, SmallVectorImpl<Constant*> &Mask, Value *IdxOp = IEI->getOperand(2); if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) { - if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp) && - EI->getOperand(0)->getType() == V->getType()) { + if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp)) { unsigned ExtractedIdx = cast<ConstantInt>(EI->getOperand(1))->getZExtValue(); unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue(); // Either the extracted from or inserted into vector must be RHSVec, // otherwise we'd end up with a shuffle of three inputs. - if (EI->getOperand(0) == RHS || RHS == 0) { - RHS = EI->getOperand(0); - Value *V = CollectShuffleElements(VecOp, Mask, RHS); + if (EI->getOperand(0) == PermittedRHS || PermittedRHS == nullptr) { + Value *RHS = EI->getOperand(0); + ShuffleOps LR = CollectShuffleElements(VecOp, Mask, RHS); + assert(LR.second == nullptr || LR.second == RHS); + + if (LR.first->getType() != RHS->getType()) { + // We tried our best, but we can't find anything compatible with RHS + // further up the chain. Return a trivial shuffle. + for (unsigned i = 0; i < NumElts; ++i) + Mask[i] = ConstantInt::get(Type::getInt32Ty(V->getContext()), i); + return std::make_pair(V, nullptr); + } + + unsigned NumLHSElts = RHS->getType()->getVectorNumElements(); Mask[InsertedIdx % NumElts] = ConstantInt::get(Type::getInt32Ty(V->getContext()), - NumElts+ExtractedIdx); - return V; + NumLHSElts+ExtractedIdx); + return std::make_pair(LR.first, RHS); } - if (VecOp == RHS) { - Value *V = CollectShuffleElements(EI->getOperand(0), Mask, RHS); - // Update Mask to reflect that `ScalarOp' has been inserted at - // position `InsertedIdx' within the vector returned by IEI. - Mask[InsertedIdx % NumElts] = Mask[ExtractedIdx]; - - // Everything but the extracted element is replaced with the RHS. 
- for (unsigned i = 0; i != NumElts; ++i) { - if (i != InsertedIdx) - Mask[i] = ConstantInt::get(Type::getInt32Ty(V->getContext()), - NumElts+i); - } - return V; + if (VecOp == PermittedRHS) { + // We've gone as far as we can: anything on the other side of the + // extractelement will already have been converted into a shuffle. + unsigned NumLHSElts = + EI->getOperand(0)->getType()->getVectorNumElements(); + for (unsigned i = 0; i != NumElts; ++i) + Mask.push_back(ConstantInt::get( + Type::getInt32Ty(V->getContext()), + i == InsertedIdx ? ExtractedIdx : NumLHSElts + i)); + return std::make_pair(EI->getOperand(0), PermittedRHS); } // If this insertelement is a chain that comes from exactly these two // vectors, return the vector and the effective shuffle. - if (CollectSingleShuffleElements(IEI, EI->getOperand(0), RHS, Mask)) - return EI->getOperand(0); + if (EI->getOperand(0)->getType() == PermittedRHS->getType() && + CollectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS, + Mask)) + return std::make_pair(EI->getOperand(0), PermittedRHS); } } } - // TODO: Handle shufflevector here! // Otherwise, can't do anything fancy. Return an identity vector. for (unsigned i = 0; i != NumElts; ++i) Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()), i)); - return V; + return std::make_pair(V, nullptr); +} + +/// Try to find redundant insertvalue instructions, like the following ones: +/// %0 = insertvalue { i8, i32 } undef, i8 %x, 0 +/// %1 = insertvalue { i8, i32 } %0, i8 %y, 0 +/// Here the second instruction inserts values at the same indices, as the +/// first one, making the first one redundant. +/// It should be transformed to: +/// %0 = insertvalue { i8, i32 } undef, i8 %y, 0 +Instruction *InstCombiner::visitInsertValueInst(InsertValueInst &I) { + bool IsRedundant = false; + ArrayRef<unsigned int> FirstIndices = I.getIndices(); + + // If there is a chain of insertvalue instructions (each of them except the + // last one has only one use and it's another insertvalue insn from this + // chain), check if any of the 'children' uses the same indices as the first + // instruction. In this case, the first one is redundant. + Value *V = &I; + unsigned Depth = 0; + while (V->hasOneUse() && Depth < 10) { + User *U = V->user_back(); + auto UserInsInst = dyn_cast<InsertValueInst>(U); + if (!UserInsInst || U->getOperand(0) != V) + break; + if (UserInsInst->getIndices() == FirstIndices) { + IsRedundant = true; + break; + } + V = UserInsInst; + Depth++; + } + + if (IsRedundant) + return ReplaceInstUsesWith(I, I.getOperand(0)); + return nullptr; } Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) { @@ -485,17 +537,18 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) { // If the inserted element was extracted from some other vector, and if the // indexes are constant, try to turn this into a shufflevector operation. 
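The new visitInsertValueInst walks a single-use chain of insertvalue instructions and drops an earlier insert whose indices are fully overwritten by a later one. The same redundancy on a plain struct, for illustration:

    #include <cassert>

    struct Pair { char A; int B; };

    int main() {
      Pair P{};    // "undef" stand-in
      P.A = 'x';   // %0 = insertvalue { i8, i32 } undef, i8 %x, 0  (dead)
      P.A = 'y';   // %1 = insertvalue { i8, i32 } %0,    i8 %y, 0
      assert(P.A == 'y');
    }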
if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) { - if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp) && - EI->getOperand(0)->getType() == IE.getType()) { - unsigned NumVectorElts = IE.getType()->getNumElements(); + if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp)) { + unsigned NumInsertVectorElts = IE.getType()->getNumElements(); + unsigned NumExtractVectorElts = + EI->getOperand(0)->getType()->getVectorNumElements(); unsigned ExtractedIdx = cast<ConstantInt>(EI->getOperand(1))->getZExtValue(); unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue(); - if (ExtractedIdx >= NumVectorElts) // Out of range extract. + if (ExtractedIdx >= NumExtractVectorElts) // Out of range extract. return ReplaceInstUsesWith(IE, VecOp); - if (InsertedIdx >= NumVectorElts) // Out of range insert. + if (InsertedIdx >= NumInsertVectorElts) // Out of range insert. return ReplaceInstUsesWith(IE, UndefValue::get(IE.getType())); // If we are extracting a value from a vector, then inserting it right @@ -505,13 +558,19 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) { // If this insertelement isn't used by some other insertelement, turn it // (and any insertelements it points to), into one big shuffle. - if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.use_back())) { + if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.user_back())) { SmallVector<Constant*, 16> Mask; - Value *RHS = 0; - Value *LHS = CollectShuffleElements(&IE, Mask, RHS); - if (RHS == 0) RHS = UndefValue::get(LHS->getType()); - // We now have a shuffle of LHS, RHS, Mask. - return new ShuffleVectorInst(LHS, RHS, ConstantVector::get(Mask)); + ShuffleOps LR = CollectShuffleElements(&IE, Mask, nullptr); + + // The proposed shuffle may be trivial, in which case we shouldn't + // perform the combine. + if (LR.first != &IE && LR.second != &IE) { + // We now have a shuffle of LHS, RHS, Mask. + if (LR.second == nullptr) + LR.second = UndefValue::get(LR.first->getType()); + return new ShuffleVectorInst(LR.first, LR.second, + ConstantVector::get(Mask)); + } } } } @@ -525,7 +584,7 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) { return &IE; } - return 0; + return nullptr; } /// Return true if we can evaluate the specified expression tree if the vector @@ -640,6 +699,8 @@ static Value *BuildNew(Instruction *I, ArrayRef<Value*> NewOps) { if (isa<PossiblyExactOperator>(BO)) { New->setIsExact(BO->isExact()); } + if (isa<FPMathOperator>(BO)) + New->copyFastMathFlags(I); return New; } case Instruction::ICmp: @@ -765,9 +826,10 @@ InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) { } } + // If element is not in Mask, no need to handle the operand 1 (element to + // be inserted). Just evaluate values in operand 0 according to Mask. if (!Found) - return UndefValue::get( - VectorType::get(V->getType()->getScalarType(), Mask.size())); + return EvaluateInDifferentElementOrder(I->getOperand(0), Mask); Value *V = EvaluateInDifferentElementOrder(I->getOperand(0), Mask); return InsertElementInst::Create(V, I->getOperand(1), @@ -777,6 +839,20 @@ InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) { llvm_unreachable("failed to reorder elements of vector instruction!"); } +static void RecognizeIdentityMask(const SmallVectorImpl<int> &Mask, + bool &isLHSID, bool &isRHSID) { + isLHSID = isRHSID = true; + + for (unsigned i = 0, e = Mask.size(); i != e; ++i) { + if (Mask[i] < 0) continue; // Ignore undef values. 
+ // Is this an identity shuffle of the LHS value? + isLHSID &= (Mask[i] == (int)i); + + // Is this an identity shuffle of the RHS value? + isRHSID &= (Mask[i]-e == i); + } +} + Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { Value *LHS = SVI.getOperand(0); Value *RHS = SVI.getOperand(1); @@ -840,16 +916,8 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { if (VWidth == LHSWidth) { // Analyze the shuffle, are the LHS or RHS and identity shuffles? - bool isLHSID = true, isRHSID = true; - - for (unsigned i = 0, e = Mask.size(); i != e; ++i) { - if (Mask[i] < 0) continue; // Ignore undef values. - // Is this an identity shuffle of the LHS value? - isLHSID &= (Mask[i] == (int)i); - - // Is this an identity shuffle of the RHS value? - isRHSID &= (Mask[i]-e == i); - } + bool isLHSID, isRHSID; + RecognizeIdentityMask(Mask, isLHSID, isRHSID); // Eliminate identity shuffles. if (isLHSID) return ReplaceInstUsesWith(SVI, LHS); @@ -908,16 +976,16 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { ShuffleVectorInst* RHSShuffle = dyn_cast<ShuffleVectorInst>(RHS); if (LHSShuffle) if (!isa<UndefValue>(LHSShuffle->getOperand(1)) && !isa<UndefValue>(RHS)) - LHSShuffle = NULL; + LHSShuffle = nullptr; if (RHSShuffle) if (!isa<UndefValue>(RHSShuffle->getOperand(1))) - RHSShuffle = NULL; + RHSShuffle = nullptr; if (!LHSShuffle && !RHSShuffle) - return MadeChange ? &SVI : 0; + return MadeChange ? &SVI : nullptr; - Value* LHSOp0 = NULL; - Value* LHSOp1 = NULL; - Value* RHSOp0 = NULL; + Value* LHSOp0 = nullptr; + Value* LHSOp1 = nullptr; + Value* RHSOp0 = nullptr; unsigned LHSOp0Width = 0; unsigned RHSOp0Width = 0; if (LHSShuffle) { @@ -949,11 +1017,11 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { // case 4 if (LHSOp0 == RHSOp0) { newLHS = LHSOp0; - newRHS = NULL; + newRHS = nullptr; } if (newLHS == LHS && newRHS == RHS) - return MadeChange ? &SVI : 0; + return MadeChange ? &SVI : nullptr; SmallVector<int, 16> LHSMask; SmallVector<int, 16> RHSMask; @@ -1012,8 +1080,8 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { // references from RHSOp0 to LHSOp0, so we don't need to shift the mask. // If newRHS == newLHS, we want to remap any references from newRHS to // newLHS so that we can properly identify splats that may occur due to - // obfuscation accross the two vectors. - if (eltMask >= 0 && newRHS != NULL && newLHS != newRHS) + // obfuscation across the two vectors. + if (eltMask >= 0 && newRHS != nullptr && newLHS != newRHS) eltMask += newLHSWidth; } @@ -1039,10 +1107,17 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { Elts.push_back(ConstantInt::get(Int32Ty, newMask[i])); } } - if (newRHS == NULL) + if (!newRHS) newRHS = UndefValue::get(newLHS->getType()); return new ShuffleVectorInst(newLHS, newRHS, ConstantVector::get(Elts)); } - return MadeChange ? &SVI : 0; + // If the result mask is an identity, replace uses of this instruction with + // corresponding argument. + bool isLHSID, isRHSID; + RecognizeIdentityMask(newMask, isLHSID, isRHSID); + if (isLHSID && VWidth == LHSOp0Width) return ReplaceInstUsesWith(SVI, newLHS); + if (isRHSID && VWidth == RHSOp0Width) return ReplaceInstUsesWith(SVI, newRHS); + + return MadeChange ? 
&SVI : nullptr; } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineWorklist.h b/contrib/llvm/lib/Transforms/InstCombine/InstCombineWorklist.h index f84db27..1ab7db3 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineWorklist.h +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineWorklist.h @@ -10,7 +10,6 @@ #ifndef INSTCOMBINE_WORKLIST_H #define INSTCOMBINE_WORKLIST_H -#define DEBUG_TYPE "instcombine" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/Instruction.h" @@ -18,6 +17,8 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#define DEBUG_TYPE "instcombine" + namespace llvm { /// InstCombineWorklist - This is the worklist management logic for @@ -68,7 +69,7 @@ public: if (It == WorklistMap.end()) return; // Not in worklist. // Don't bother moving everything down, just null out the slot. - Worklist[It->second] = 0; + Worklist[It->second] = nullptr; WorklistMap.erase(It); } @@ -84,9 +85,8 @@ public: /// now. /// void AddUsersToWorkList(Instruction &I) { - for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); - UI != UE; ++UI) - Add(cast<Instruction>(*UI)); + for (User *U : I.users()) + Add(cast<Instruction>(U)); } @@ -102,4 +102,6 @@ public: } // end namespace llvm. +#undef DEBUG_TYPE + #endif diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 191a101..d3648e2 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -33,7 +33,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "instcombine" #include "llvm/Transforms/Scalar.h" #include "InstCombine.h" #include "llvm-c/Initialization.h" @@ -43,14 +42,15 @@ #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/Support/CFG.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/GetElementPtrTypeIterator.h" -#include "llvm/Support/PatternMatch.h" -#include "llvm/Support/ValueHandle.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" #include <algorithm> @@ -58,6 +58,8 @@ using namespace llvm; using namespace llvm::PatternMatch; +#define DEBUG_TYPE "instcombine" + STATISTIC(NumCombined , "Number of insts combined"); STATISTIC(NumConstProp, "Number of constant folds"); STATISTIC(NumDeadInst , "Number of dead inst eliminated"); @@ -103,13 +105,13 @@ Value *InstCombiner::EmitGEPOffset(User *GEP) { bool InstCombiner::ShouldChangeType(Type *From, Type *To) const { assert(From->isIntegerTy() && To->isIntegerTy()); - // If we don't have TD, we don't know if the source/dest are legal. - if (!TD) return false; + // If we don't have DL, we don't know if the source/dest are legal. 
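ShouldChangeType, whose body continues below, gates integer retyping on target legality as reported by the renamed DL member. A hedged sketch of the shape of that test, assuming a typical target whose legal integer widths are 8, 16, 32 and 64; the second rule, about not growing between two illegal widths, reflects the function's usual behavior and is not itself visible in this hunk:

#include <cassert>
#include <initializer_list>

// Illustrative stand-in for DataLayout::isLegalInteger on a common target.
static bool isLegalInteger(unsigned Bits) {
  for (unsigned W : {8u, 16u, 32u, 64u})
    if (Bits == W) return true;
  return false;
}

static bool shouldChangeType(unsigned FromBits, unsigned ToBits) {
  bool FromLegal = isLegalInteger(FromBits);
  bool ToLegal = isLegalInteger(ToBits);
  // Rule shown in the hunk: never rewrite a legal integer into an illegal one.
  if (FromLegal && !ToLegal) return false;
  // Between two illegal widths, avoid growing the value (assumed completion
  // of the guard, stated here for the example to be self-contained).
  if (!FromLegal && !ToLegal && ToBits > FromBits) return false;
  return true;
}

int main() {
  assert(shouldChangeType(39, 32));   // illegal i39 -> legal i32: allowed
  assert(!shouldChangeType(32, 39));  // legal i32 -> illegal i39: refused
  return 0;
}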
+ if (!DL) return false; unsigned FromWidth = From->getPrimitiveSizeInBits(); unsigned ToWidth = To->getPrimitiveSizeInBits(); - bool FromLegal = TD->isLegalInteger(FromWidth); - bool ToLegal = TD->isLegalInteger(ToWidth); + bool FromLegal = DL->isLegalInteger(FromWidth); + bool ToLegal = DL->isLegalInteger(ToWidth); // If this is a legal integer from type, and the result would be an illegal // type, don't do the transformation. @@ -221,7 +223,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { Value *C = I.getOperand(1); // Does "B op C" simplify? - if (Value *V = SimplifyBinOp(Opcode, B, C, TD)) { + if (Value *V = SimplifyBinOp(Opcode, B, C, DL)) { // It simplifies to V. Form "A op V". I.setOperand(0, A); I.setOperand(1, V); @@ -250,7 +252,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { Value *C = Op1->getOperand(1); // Does "A op B" simplify? - if (Value *V = SimplifyBinOp(Opcode, A, B, TD)) { + if (Value *V = SimplifyBinOp(Opcode, A, B, DL)) { // It simplifies to V. Form "V op C". I.setOperand(0, V); I.setOperand(1, C); @@ -272,7 +274,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { Value *C = I.getOperand(1); // Does "C op A" simplify? - if (Value *V = SimplifyBinOp(Opcode, C, A, TD)) { + if (Value *V = SimplifyBinOp(Opcode, C, A, DL)) { // It simplifies to V. Form "V op B". I.setOperand(0, V); I.setOperand(1, B); @@ -292,7 +294,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { Value *C = Op1->getOperand(1); // Does "C op A" simplify? - if (Value *V = SimplifyBinOp(Opcode, C, A, TD)) { + if (Value *V = SimplifyBinOp(Opcode, C, A, DL)) { // It simplifies to V. Form "B op V". I.setOperand(0, B); I.setOperand(1, V); @@ -319,6 +321,12 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { Constant *Folded = ConstantExpr::get(Opcode, C1, C2); BinaryOperator *New = BinaryOperator::Create(Opcode, A, B); + if (isa<FPMathOperator>(New)) { + FastMathFlags Flags = I.getFastMathFlags(); + Flags &= Op0->getFastMathFlags(); + Flags &= Op1->getFastMathFlags(); + New->setFastMathFlags(Flags); + } InsertNewInstWith(New, I); New->takeName(Op1); I.setOperand(0, New); @@ -388,6 +396,127 @@ static bool RightDistributesOverLeft(Instruction::BinaryOps LOp, return false; } +/// This function returns identity value for given opcode, which can be used to +/// factor patterns like (X * 2) + X ==> (X * 2) + (X * 1) ==> X * (2 + 1). +static Value *getIdentityValue(Instruction::BinaryOps OpCode, Value *V) { + if (isa<Constant>(V)) + return nullptr; + + if (OpCode == Instruction::Mul) + return ConstantInt::get(V->getType(), 1); + + // TODO: We can handle other cases e.g. Instruction::And, Instruction::Or etc. + + return nullptr; +} + +/// This function factors binary ops which can be combined using distributive +/// laws. This also factor SHL as MUL e.g. SHL(X, 2) ==> MUL(X, 4). +static Instruction::BinaryOps +getBinOpsForFactorization(BinaryOperator *Op, Value *&LHS, Value *&RHS) { + if (!Op) + return Instruction::BinaryOpsEnd; + + if (Op->getOpcode() == Instruction::Shl) { + if (Constant *CST = dyn_cast<Constant>(Op->getOperand(1))) { + // The multiplier is really 1 << CST. + RHS = ConstantExpr::getShl(ConstantInt::get(Op->getType(), 1), CST); + LHS = Op->getOperand(0); + return Instruction::Mul; + } + } + + // TODO: We can add other conversions e.g. shr => div etc. 
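getBinOpsForFactorization, which continues below, rewrites a shift into its multiply form so that shifts can take part in multiply factorization. A tiny self-contained illustration of why the rewrite is sound (names here are illustrative):

#include <cassert>
#include <cstdint>

// "x << c" is the same value as "x * (1 << c)", so a shift can be handed to
// the multiply factorizer as a multiplication by a power of two.
static int64_t shlAsMulConstant(unsigned ShAmt) { return int64_t(1) << ShAmt; }

int main() {
  int64_t X = 37;
  int64_t Shifted = (X << 2) + X;                    // original form
  int64_t Factored = X * (shlAsMulConstant(2) + 1);  // x*4 + x*1 = x*5
  assert(Shifted == Factored);
  return 0;
}

getIdentityValue supplies the matching "* 1" for a bare operand, which is how (X << 2) + X is seen as X*4 + X*1 and factors to X * 5, exactly the pattern the doc comments above describe.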
+ + LHS = Op->getOperand(0); + RHS = Op->getOperand(1); + return Op->getOpcode(); +} + +/// This tries to simplify binary operations by factorizing out common terms +/// (e. g. "(A*B)+(A*C)" -> "A*(B+C)"). +static Value *tryFactorization(InstCombiner::BuilderTy *Builder, + const DataLayout *DL, BinaryOperator &I, + Instruction::BinaryOps InnerOpcode, Value *A, + Value *B, Value *C, Value *D) { + + // If any of A, B, C, D are null, we can not factor I, return early. + // Checking A and C should be enough. + if (!A || !C || !B || !D) + return nullptr; + + Value *SimplifiedInst = nullptr; + Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); + Instruction::BinaryOps TopLevelOpcode = I.getOpcode(); + + // Does "X op' Y" always equal "Y op' X"? + bool InnerCommutative = Instruction::isCommutative(InnerOpcode); + + // Does "X op' (Y op Z)" always equal "(X op' Y) op (X op' Z)"? + if (LeftDistributesOverRight(InnerOpcode, TopLevelOpcode)) + // Does the instruction have the form "(A op' B) op (A op' D)" or, in the + // commutative case, "(A op' B) op (C op' A)"? + if (A == C || (InnerCommutative && A == D)) { + if (A != C) + std::swap(C, D); + // Consider forming "A op' (B op D)". + // If "B op D" simplifies then it can be formed with no cost. + Value *V = SimplifyBinOp(TopLevelOpcode, B, D, DL); + // If "B op D" doesn't simplify then only go on if both of the existing + // operations "A op' B" and "C op' D" will be zapped as no longer used. + if (!V && LHS->hasOneUse() && RHS->hasOneUse()) + V = Builder->CreateBinOp(TopLevelOpcode, B, D, RHS->getName()); + if (V) { + SimplifiedInst = Builder->CreateBinOp(InnerOpcode, A, V); + } + } + + // Does "(X op Y) op' Z" always equal "(X op' Z) op (Y op' Z)"? + if (!SimplifiedInst && RightDistributesOverLeft(TopLevelOpcode, InnerOpcode)) + // Does the instruction have the form "(A op' B) op (C op' B)" or, in the + // commutative case, "(A op' B) op (B op' D)"? + if (B == D || (InnerCommutative && B == C)) { + if (B != D) + std::swap(C, D); + // Consider forming "(A op C) op' B". + // If "A op C" simplifies then it can be formed with no cost. + Value *V = SimplifyBinOp(TopLevelOpcode, A, C, DL); + + // If "A op C" doesn't simplify then only go on if both of the existing + // operations "A op' B" and "C op' D" will be zapped as no longer used. + if (!V && LHS->hasOneUse() && RHS->hasOneUse()) + V = Builder->CreateBinOp(TopLevelOpcode, A, C, LHS->getName()); + if (V) { + SimplifiedInst = Builder->CreateBinOp(InnerOpcode, V, B); + } + } + + if (SimplifiedInst) { + ++NumFactor; + SimplifiedInst->takeName(&I); + + // Check if we can add NSW flag to SimplifiedInst. If so, set NSW flag. + // TODO: Check for NUW. 
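tryFactorization, whose flag handling continues just below, factors "(A op' B) op (C op' D)" when the two inner operations share an operand and op' distributes over op, and it only builds new instructions when either the inner result simplifies or both original operations become dead. A minimal sketch of the left-distributive case with op' = * and op = +:

#include <cassert>

// "(A * B) + (A * D)" -> "A * (B + D)", valid because multiplication
// left-distributes over addition. Overflow flags are ignored here; the real
// code re-derives no-signed-wrap from the three original operations.
static long factorLeft(long A, long B, long D) { return A * (B + D); }

int main() {
  long A = 6, B = 11, D = -4;
  assert(A * B + A * D == factorLeft(A, B, D));
  return 0;
}

Note the one-use gating in the hunk: if "B op D" does not simplify, the rewrite only proceeds when both original inner operations will be erased, so the transform never increases the instruction count.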
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(SimplifiedInst)) { + if (isa<OverflowingBinaryOperator>(SimplifiedInst)) { + bool HasNSW = false; + if (isa<OverflowingBinaryOperator>(&I)) + HasNSW = I.hasNoSignedWrap(); + + if (BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS)) + if (isa<OverflowingBinaryOperator>(Op0)) + HasNSW &= Op0->hasNoSignedWrap(); + + if (BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS)) + if (isa<OverflowingBinaryOperator>(Op1)) + HasNSW &= Op1->hasNoSignedWrap(); + BO->setHasNoSignedWrap(HasNSW); + } + } + } + return SimplifiedInst; +} + /// SimplifyUsingDistributiveLaws - This tries to simplify binary operations /// which some other binary operation distributes over either by factorizing /// out common terms (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this @@ -397,65 +526,33 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS); BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS); - Instruction::BinaryOps TopLevelOpcode = I.getOpcode(); // op // Factorization. - if (Op0 && Op1 && Op0->getOpcode() == Op1->getOpcode()) { - // The instruction has the form "(A op' B) op (C op' D)". Try to factorize - // a common term. - Value *A = Op0->getOperand(0), *B = Op0->getOperand(1); - Value *C = Op1->getOperand(0), *D = Op1->getOperand(1); - Instruction::BinaryOps InnerOpcode = Op0->getOpcode(); // op' + Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr; + Instruction::BinaryOps LHSOpcode = getBinOpsForFactorization(Op0, A, B); + Instruction::BinaryOps RHSOpcode = getBinOpsForFactorization(Op1, C, D); + + // The instruction has the form "(A op' B) op (C op' D)". Try to factorize + // a common term. + if (LHSOpcode == RHSOpcode) { + if (Value *V = tryFactorization(Builder, DL, I, LHSOpcode, A, B, C, D)) + return V; + } - // Does "X op' Y" always equal "Y op' X"? - bool InnerCommutative = Instruction::isCommutative(InnerOpcode); - - // Does "X op' (Y op Z)" always equal "(X op' Y) op (X op' Z)"? - if (LeftDistributesOverRight(InnerOpcode, TopLevelOpcode)) - // Does the instruction have the form "(A op' B) op (A op' D)" or, in the - // commutative case, "(A op' B) op (C op' A)"? - if (A == C || (InnerCommutative && A == D)) { - if (A != C) - std::swap(C, D); - // Consider forming "A op' (B op D)". - // If "B op D" simplifies then it can be formed with no cost. - Value *V = SimplifyBinOp(TopLevelOpcode, B, D, TD); - // If "B op D" doesn't simplify then only go on if both of the existing - // operations "A op' B" and "C op' D" will be zapped as no longer used. - if (!V && Op0->hasOneUse() && Op1->hasOneUse()) - V = Builder->CreateBinOp(TopLevelOpcode, B, D, Op1->getName()); - if (V) { - ++NumFactor; - V = Builder->CreateBinOp(InnerOpcode, A, V); - V->takeName(&I); - return V; - } - } + // The instruction has the form "(A op' B) op (C)". Try to factorize common + // term. + if (Value *V = tryFactorization(Builder, DL, I, LHSOpcode, A, B, RHS, + getIdentityValue(LHSOpcode, RHS))) + return V; - // Does "(X op Y) op' Z" always equal "(X op' Z) op (Y op' Z)"? - if (RightDistributesOverLeft(TopLevelOpcode, InnerOpcode)) - // Does the instruction have the form "(A op' B) op (C op' B)" or, in the - // commutative case, "(A op' B) op (B op' D)"? - if (B == D || (InnerCommutative && B == C)) { - if (B != D) - std::swap(C, D); - // Consider forming "(A op C) op' B". - // If "A op C" simplifies then it can be formed with no cost. 
- Value *V = SimplifyBinOp(TopLevelOpcode, A, C, TD); - // If "A op C" doesn't simplify then only go on if both of the existing - // operations "A op' B" and "C op' D" will be zapped as no longer used. - if (!V && Op0->hasOneUse() && Op1->hasOneUse()) - V = Builder->CreateBinOp(TopLevelOpcode, A, C, Op0->getName()); - if (V) { - ++NumFactor; - V = Builder->CreateBinOp(InnerOpcode, V, B); - V->takeName(&I); - return V; - } - } - } + // The instruction has the form "(B) op (C op' D)". Try to factorize common + // term. + if (Value *V = tryFactorization(Builder, DL, I, RHSOpcode, LHS, + getIdentityValue(RHSOpcode, LHS), C, D)) + return V; // Expansion. + Instruction::BinaryOps TopLevelOpcode = I.getOpcode(); if (Op0 && RightDistributesOverLeft(Op0->getOpcode(), TopLevelOpcode)) { // The instruction has the form "(A op' B) op C". See if expanding it out // to "(A op C) op' (B op C)" results in simplifications. @@ -463,8 +560,8 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { Instruction::BinaryOps InnerOpcode = Op0->getOpcode(); // op' // Do "A op C" and "B op C" both simplify? - if (Value *L = SimplifyBinOp(TopLevelOpcode, A, C, TD)) - if (Value *R = SimplifyBinOp(TopLevelOpcode, B, C, TD)) { + if (Value *L = SimplifyBinOp(TopLevelOpcode, A, C, DL)) + if (Value *R = SimplifyBinOp(TopLevelOpcode, B, C, DL)) { // They do! Return "L op' R". ++NumExpand; // If "L op' R" equals "A op' B" then "L op' R" is just the LHS. @@ -472,7 +569,7 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { (Instruction::isCommutative(InnerOpcode) && L == B && R == A)) return Op0; // Otherwise return "L op' R" if it simplifies. - if (Value *V = SimplifyBinOp(InnerOpcode, L, R, TD)) + if (Value *V = SimplifyBinOp(InnerOpcode, L, R, DL)) return V; // Otherwise, create a new instruction. C = Builder->CreateBinOp(InnerOpcode, L, R); @@ -488,8 +585,8 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { Instruction::BinaryOps InnerOpcode = Op1->getOpcode(); // op' // Do "A op B" and "A op C" both simplify? - if (Value *L = SimplifyBinOp(TopLevelOpcode, A, B, TD)) - if (Value *R = SimplifyBinOp(TopLevelOpcode, A, C, TD)) { + if (Value *L = SimplifyBinOp(TopLevelOpcode, A, B, DL)) + if (Value *R = SimplifyBinOp(TopLevelOpcode, A, C, DL)) { // They do! Return "L op' R". ++NumExpand; // If "L op' R" equals "B op' C" then "L op' R" is just the RHS. @@ -497,7 +594,7 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { (Instruction::isCommutative(InnerOpcode) && L == C && R == B)) return Op1; // Otherwise return "L op' R" if it simplifies. - if (Value *V = SimplifyBinOp(InnerOpcode, L, R, TD)) + if (Value *V = SimplifyBinOp(InnerOpcode, L, R, DL)) return V; // Otherwise, create a new instruction. 
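The expansion path runs the distributive law the other way: "(A op' B) op C" becomes "(A op C) op' (B op C)", and the code above keeps the expanded form only when both halves fold. A short exhaustive check of one valid instance, OR distributing over AND:

#include <cassert>

// "(A & B) | C" -> "(A | C) & (B | C)": OR right-distributes over AND.
static unsigned expand(unsigned A, unsigned B, unsigned C) {
  return (A | C) & (B | C);
}

int main() {
  for (unsigned A = 0; A < 4; ++A)
    for (unsigned B = 0; B < 4; ++B)
      for (unsigned C = 0; C < 4; ++C)
        assert(((A & B) | C) == expand(A, B, C));
  return 0;
}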
A = Builder->CreateBinOp(InnerOpcode, L, R); @@ -506,7 +603,7 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { } } - return 0; + return nullptr; } // dyn_castNegVal - Given a 'sub' instruction, return the RHS of the instruction @@ -524,7 +621,7 @@ Value *InstCombiner::dyn_castNegVal(Value *V) const { if (C->getType()->getElementType()->isIntegerTy()) return ConstantExpr::getNeg(C); - return 0; + return nullptr; } // dyn_castFNegVal - Given a 'fsub' instruction, return the RHS of the @@ -543,7 +640,7 @@ Value *InstCombiner::dyn_castFNegVal(Value *V, bool IgnoreZeroSign) const { if (C->getType()->getElementType()->isFloatingPointTy()) return ConstantExpr::getFNeg(C); - return 0; + return nullptr; } static Value *FoldOperationIntoSelectOperand(Instruction &I, Value *SO, @@ -566,9 +663,14 @@ static Value *FoldOperationIntoSelectOperand(Instruction &I, Value *SO, if (!ConstIsRHS) std::swap(Op0, Op1); - if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&I)) - return IC->Builder->CreateBinOp(BO->getOpcode(), Op0, Op1, + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&I)) { + Value *RI = IC->Builder->CreateBinOp(BO->getOpcode(), Op0, Op1, SO->getName()+".op"); + Instruction *FPInst = dyn_cast<Instruction>(RI); + if (FPInst && isa<FPMathOperator>(FPInst)) + FPInst->copyFastMathFlags(BO); + return RI; + } if (ICmpInst *CI = dyn_cast<ICmpInst>(&I)) return IC->Builder->CreateICmp(CI->getPredicate(), Op0, Op1, SO->getName()+".cmp"); @@ -584,13 +686,13 @@ static Value *FoldOperationIntoSelectOperand(Instruction &I, Value *SO, // not have a second operand. Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { // Don't modify shared select instructions - if (!SI->hasOneUse()) return 0; + if (!SI->hasOneUse()) return nullptr; Value *TV = SI->getOperand(1); Value *FV = SI->getOperand(2); if (isa<Constant>(TV) || isa<Constant>(FV)) { // Bool selects with constant operands can be folded to logical ops. - if (SI->getType()->isIntegerTy(1)) return 0; + if (SI->getType()->isIntegerTy(1)) return nullptr; // If it's a bitcast involving vectors, make sure it has the same number of // elements on both sides. @@ -599,10 +701,10 @@ Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { VectorType *SrcTy = dyn_cast<VectorType>(BC->getSrcTy()); // Verify that either both or neither are vectors. - if ((SrcTy == NULL) != (DestTy == NULL)) return 0; + if ((SrcTy == nullptr) != (DestTy == nullptr)) return nullptr; // If vectors, verify that they have the same number of elements. if (SrcTy && SrcTy->getNumElements() != DestTy->getNumElements()) - return 0; + return nullptr; } Value *SelectTrueVal = FoldOperationIntoSelectOperand(Op, TV, this); @@ -611,7 +713,7 @@ Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { return SelectInst::Create(SI->getCondition(), SelectTrueVal, SelectFalseVal); } - return 0; + return nullptr; } @@ -623,18 +725,17 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { PHINode *PN = cast<PHINode>(I.getOperand(0)); unsigned NumPHIValues = PN->getNumIncomingValues(); if (NumPHIValues == 0) - return 0; + return nullptr; // We normally only transform phis with a single use. However, if a PHI has // multiple uses and they are all the same operation, we can fold *all* of the // uses into the PHI. if (!PN->hasOneUse()) { // Walk the use list for the instruction, comparing them to I. 
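FoldOperationIntoSelectOperand and FoldOpIntoSelect, above, push an operation into both arms of a single-use select; the hunk additionally copies fast-math flags onto the new floating-point operation. A scalar sketch of the rewrite for an integer add (illustrative names, not LLVM API):

#include <cassert>

// op(select(c, T, F), K) -> select(c, op(T, K), op(F, K)); profitable when
// T and F are constants, because then both arms constant-fold away.
static int foldedSelectAdd(bool C, int T, int F, int K) {
  int TrueArm = T + K;    // op applied to the true operand
  int FalseArm = F + K;   // op applied to the false operand
  return C ? TrueArm : FalseArm;
}

int main() {
  for (int C = 0; C <= 1; ++C)
    assert(((C ? 10 : 20) + 7) == foldedSelectAdd(C, 10, 20, 7));
  return 0;
}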
- for (Value::use_iterator UI = PN->use_begin(), E = PN->use_end(); - UI != E; ++UI) { - Instruction *User = cast<Instruction>(*UI); - if (User != &I && !I.isIdenticalTo(User)) - return 0; + for (User *U : PN->users()) { + Instruction *UI = cast<Instruction>(U); + if (UI != &I && !I.isIdenticalTo(UI)) + return nullptr; } // Otherwise, we can replace *all* users with the new PHI we form. } @@ -644,14 +745,14 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { // remember the BB it is in. If there is more than one or if *it* is a PHI, // bail out. We don't do arbitrary constant expressions here because moving // their computation can be expensive without a cost model. - BasicBlock *NonConstBB = 0; + BasicBlock *NonConstBB = nullptr; for (unsigned i = 0; i != NumPHIValues; ++i) { Value *InVal = PN->getIncomingValue(i); if (isa<Constant>(InVal) && !isa<ConstantExpr>(InVal)) continue; - if (isa<PHINode>(InVal)) return 0; // Itself a phi. - if (NonConstBB) return 0; // More than one non-const value. + if (isa<PHINode>(InVal)) return nullptr; // Itself a phi. + if (NonConstBB) return nullptr; // More than one non-const value. NonConstBB = PN->getIncomingBlock(i); @@ -659,22 +760,22 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { // insert a computation after it without breaking the edge. if (InvokeInst *II = dyn_cast<InvokeInst>(InVal)) if (II->getParent() == NonConstBB) - return 0; + return nullptr; // If the incoming non-constant value is in I's block, we will remove one // instruction, but insert another equivalent one, leading to infinite // instcombine. if (NonConstBB == I.getParent()) - return 0; + return nullptr; } // If there is exactly one non-constant value, we can insert a copy of the // operation in that block. However, if this is a critical edge, we would be // inserting the computation one some other paths (e.g. inside a loop). Only // do this if the pred block is unconditionally branching into the phi block. - if (NonConstBB != 0) { + if (NonConstBB != nullptr) { BranchInst *BI = dyn_cast<BranchInst>(NonConstBB->getTerminator()); - if (!BI || !BI->isUnconditional()) return 0; + if (!BI || !BI->isUnconditional()) return nullptr; } // Okay, we can do the transformation: create the new PHI node. @@ -698,7 +799,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { BasicBlock *ThisBB = PN->getIncomingBlock(i); Value *TrueVInPred = TrueV->DoPHITranslation(PhiTransBB, ThisBB); Value *FalseVInPred = FalseV->DoPHITranslation(PhiTransBB, ThisBB); - Value *InV = 0; + Value *InV = nullptr; // Beware of ConstantExpr: it may eventually evaluate to getNullValue, // even if currently isNullValue gives false. 
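FoldOpIntoPhi only fires when at most one incoming value of the PHI is non-constant, and the checks continued here further reject that value being a PHI itself or sitting behind an edge that cannot be split. A sketch of the counting guard, with std::nullopt standing in for a non-constant incoming value:

#include <cstdio>
#include <optional>
#include <vector>

using Incoming = std::optional<int>;  // nullopt = non-constant incoming value

static bool canFoldOpIntoPhi(const std::vector<Incoming> &Ins) {
  int NonConst = 0;
  for (const Incoming &I : Ins)
    if (!I && ++NonConst > 1)
      return false;   // more than one non-constant incoming value: bail
  return !Ins.empty();
}

int main() {
  std::printf("%d\n", canFoldOpIntoPhi({1, 2, std::nullopt}));              // 1
  std::printf("%d\n", canFoldOpIntoPhi({std::nullopt, std::nullopt, 3}));   // 0
}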
Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)); @@ -712,7 +813,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { } else if (CmpInst *CI = dyn_cast<CmpInst>(&I)) { Constant *C = cast<Constant>(I.getOperand(1)); for (unsigned i = 0; i != NumPHIValues; ++i) { - Value *InV = 0; + Value *InV = nullptr; if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i))) InV = ConstantExpr::getCompare(CI->getPredicate(), InC, C); else if (isa<ICmpInst>(CI)) @@ -726,7 +827,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { } else if (I.getNumOperands() == 2) { Constant *C = cast<Constant>(I.getOperand(1)); for (unsigned i = 0; i != NumPHIValues; ++i) { - Value *InV = 0; + Value *InV = nullptr; if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i))) InV = ConstantExpr::get(I.getOpcode(), InC, C); else @@ -748,8 +849,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { } } - for (Value::use_iterator UI = PN->use_begin(), E = PN->use_end(); - UI != E; ) { + for (auto UI = PN->user_begin(), E = PN->user_end(); UI != E;) { Instruction *User = cast<Instruction>(*UI++); if (User == &I) continue; ReplaceInstUsesWith(*User, NewPN); @@ -766,19 +866,19 @@ Type *InstCombiner::FindElementAtOffset(Type *PtrTy, int64_t Offset, SmallVectorImpl<Value*> &NewIndices) { assert(PtrTy->isPtrOrPtrVectorTy()); - if (!TD) - return 0; + if (!DL) + return nullptr; Type *Ty = PtrTy->getPointerElementType(); if (!Ty->isSized()) - return 0; + return nullptr; // Start with the index over the outer type. Note that the type size // might be zero (even if the offset isn't zero) if the indexed type // is something like [0 x {int, int}] - Type *IntPtrTy = TD->getIntPtrType(PtrTy); + Type *IntPtrTy = DL->getIntPtrType(PtrTy); int64_t FirstIdx = 0; - if (int64_t TySize = TD->getTypeAllocSize(Ty)) { + if (int64_t TySize = DL->getTypeAllocSize(Ty)) { FirstIdx = Offset/TySize; Offset -= FirstIdx*TySize; @@ -796,11 +896,11 @@ Type *InstCombiner::FindElementAtOffset(Type *PtrTy, int64_t Offset, // Index into the types. If we fail, set OrigBase to null. while (Offset) { // Indexing into tail padding between struct/array elements. - if (uint64_t(Offset*8) >= TD->getTypeSizeInBits(Ty)) - return 0; + if (uint64_t(Offset*8) >= DL->getTypeSizeInBits(Ty)) + return nullptr; if (StructType *STy = dyn_cast<StructType>(Ty)) { - const StructLayout *SL = TD->getStructLayout(STy); + const StructLayout *SL = DL->getStructLayout(STy); assert(Offset < (int64_t)SL->getSizeInBytes() && "Offset must stay within the indexed type"); @@ -811,14 +911,14 @@ Type *InstCombiner::FindElementAtOffset(Type *PtrTy, int64_t Offset, Offset -= SL->getElementOffset(Elt); Ty = STy->getElementType(Elt); } else if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) { - uint64_t EltSize = TD->getTypeAllocSize(AT->getElementType()); + uint64_t EltSize = DL->getTypeAllocSize(AT->getElementType()); assert(EltSize && "Cannot index into a zero-sized array"); NewIndices.push_back(ConstantInt::get(IntPtrTy,Offset/EltSize)); Offset %= EltSize; Ty = AT->getElementType(); } else { // Otherwise, we can't index into the middle of this atomic type, bail. - return 0; + return nullptr; } } @@ -850,7 +950,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { // If Scale is zero then it does not divide Val. if (Scale.isMinValue()) - return 0; + return nullptr; // Look through chains of multiplications, searching for a constant that is // divisible by Scale. 
For example, descaling X*(Y*(Z*4)) by a factor of 4 @@ -893,7 +993,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { APInt::sdivrem(CI->getValue(), Scale, Quotient, Remainder); if (!Remainder.isMinValue()) // Not divisible by Scale. - return 0; + return nullptr; // Replace with the quotient in the parent. Op = ConstantInt::get(CI->getType(), Quotient); NoSignedWrap = true; @@ -906,7 +1006,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { // Multiplication. NoSignedWrap = BO->hasNoSignedWrap(); if (RequireNoSignedWrap && !NoSignedWrap) - return 0; + return nullptr; // There are three cases for multiplication: multiplication by exactly // the scale, multiplication by a constant different to the scale, and @@ -925,7 +1025,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { // Otherwise drill down into the constant. if (!Op->hasOneUse()) - return 0; + return nullptr; Parent = std::make_pair(BO, 1); continue; @@ -934,7 +1034,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { // Multiplication by something else. Drill down into the left-hand side // since that's where the reassociate pass puts the good stuff. if (!Op->hasOneUse()) - return 0; + return nullptr; Parent = std::make_pair(BO, 0); continue; @@ -945,7 +1045,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { // Multiplication by a power of 2. NoSignedWrap = BO->hasNoSignedWrap(); if (RequireNoSignedWrap && !NoSignedWrap) - return 0; + return nullptr; Value *LHS = BO->getOperand(0); int32_t Amt = cast<ConstantInt>(BO->getOperand(1))-> @@ -959,7 +1059,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { break; } if (Amt < logScale || !Op->hasOneUse()) - return 0; + return nullptr; // Multiplication by more than the scale. Reduce the multiplying amount // by the scale in the parent. @@ -970,7 +1070,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { } if (!Op->hasOneUse()) - return 0; + return nullptr; if (CastInst *Cast = dyn_cast<CastInst>(Op)) { if (Cast->getOpcode() == Instruction::SExt) { @@ -984,7 +1084,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { // Scale and the multiplication Y * SmallScale should not overflow. if (SmallScale.sext(Scale.getBitWidth()) != Scale) // SmallScale does not sign-extend to Scale. - return 0; + return nullptr; assert(SmallScale.exactLogBase2() == logScale); // Require that Y * SmallScale must not overflow. RequireNoSignedWrap = true; @@ -1003,7 +1103,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { // trunc (Y * sext Scale) does not, so nsw flags need to be cleared // from this point up in the expression (see later). if (RequireNoSignedWrap) - return 0; + return nullptr; // Drill down through the cast. unsigned LargeSize = Cast->getSrcTy()->getPrimitiveSizeInBits(); @@ -1017,7 +1117,13 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { } // Unsupported expression, bail out. - return 0; + return nullptr; + } + + // If Op is zero then Val = Op * Scale. + if (match(Op, m_Zero())) { + NoSignedWrap = true; + return Op; } // We know that we can successfully descale, so from here on we can safely @@ -1069,23 +1175,125 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { // Move up one level in the expression. 
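Descale, spread across this hunk, looks for an X with Val = X * Scale. The newly added m_Zero() case observes that zero descales to zero, trivially and with no possibility of signed wrap. A sketch of the constant and zero cases only (the recursive walk through muls, shifts and sign extensions is omitted):

#include <cstdio>
#include <optional>

// Find X with Val = X * Scale for plain integers, or report failure.
static std::optional<long> descale(long Val, long Scale) {
  if (Scale == 0) return std::nullopt;        // zero does not divide Val
  if (Val == 0) return 0;                     // new case: 0 == 0 * Scale
  if (Val % Scale != 0) return std::nullopt;  // not exactly divisible
  return Val / Scale;
}

int main() {
  if (auto X = descale(24, 4)) std::printf("24 = %ld * 4\n", *X);
  if (!descale(25, 4)) std::printf("25 is not a multiple of 4\n");
}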
assert(Ancestor->hasOneUse() && "Drilled down when more than one use!"); - Ancestor = Ancestor->use_back(); + Ancestor = Ancestor->user_back(); } while (1); } +/// \brief Creates node of binary operation with the same attributes as the +/// specified one but with other operands. +static Value *CreateBinOpAsGiven(BinaryOperator &Inst, Value *LHS, Value *RHS, + InstCombiner::BuilderTy *B) { + Value *BORes = B->CreateBinOp(Inst.getOpcode(), LHS, RHS); + if (BinaryOperator *NewBO = dyn_cast<BinaryOperator>(BORes)) { + if (isa<OverflowingBinaryOperator>(NewBO)) { + NewBO->setHasNoSignedWrap(Inst.hasNoSignedWrap()); + NewBO->setHasNoUnsignedWrap(Inst.hasNoUnsignedWrap()); + } + if (isa<PossiblyExactOperator>(NewBO)) + NewBO->setIsExact(Inst.isExact()); + } + return BORes; +} + +/// \brief Makes transformation of binary operation specific for vector types. +/// \param Inst Binary operator to transform. +/// \return Pointer to node that must replace the original binary operator, or +/// null pointer if no transformation was made. +Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) { + if (!Inst.getType()->isVectorTy()) return nullptr; + + // It may not be safe to reorder shuffles and things like div, urem, etc. + // because we may trap when executing those ops on unknown vector elements. + // See PR20059. + if (!isSafeToSpeculativelyExecute(&Inst, DL)) return nullptr; + + unsigned VWidth = cast<VectorType>(Inst.getType())->getNumElements(); + Value *LHS = Inst.getOperand(0), *RHS = Inst.getOperand(1); + assert(cast<VectorType>(LHS->getType())->getNumElements() == VWidth); + assert(cast<VectorType>(RHS->getType())->getNumElements() == VWidth); + + // If both arguments of binary operation are shuffles, which use the same + // mask and shuffle within a single vector, it is worthwhile to move the + // shuffle after binary operation: + // Op(shuffle(v1, m), shuffle(v2, m)) -> shuffle(Op(v1, v2), m) + if (isa<ShuffleVectorInst>(LHS) && isa<ShuffleVectorInst>(RHS)) { + ShuffleVectorInst *LShuf = cast<ShuffleVectorInst>(LHS); + ShuffleVectorInst *RShuf = cast<ShuffleVectorInst>(RHS); + if (isa<UndefValue>(LShuf->getOperand(1)) && + isa<UndefValue>(RShuf->getOperand(1)) && + LShuf->getOperand(0)->getType() == RShuf->getOperand(0)->getType() && + LShuf->getMask() == RShuf->getMask()) { + Value *NewBO = CreateBinOpAsGiven(Inst, LShuf->getOperand(0), + RShuf->getOperand(0), Builder); + Value *Res = Builder->CreateShuffleVector(NewBO, + UndefValue::get(NewBO->getType()), LShuf->getMask()); + return Res; + } + } + + // If one argument is a shuffle within one vector, the other is a constant, + // try moving the shuffle after the binary operation. + ShuffleVectorInst *Shuffle = nullptr; + Constant *C1 = nullptr; + if (isa<ShuffleVectorInst>(LHS)) Shuffle = cast<ShuffleVectorInst>(LHS); + if (isa<ShuffleVectorInst>(RHS)) Shuffle = cast<ShuffleVectorInst>(RHS); + if (isa<Constant>(LHS)) C1 = cast<Constant>(LHS); + if (isa<Constant>(RHS)) C1 = cast<Constant>(RHS); + if (Shuffle && C1 && + (isa<ConstantVector>(C1) || isa<ConstantDataVector>(C1)) && + isa<UndefValue>(Shuffle->getOperand(1)) && + Shuffle->getType() == Shuffle->getOperand(0)->getType()) { + SmallVector<int, 16> ShMask = Shuffle->getShuffleMask(); + // Find constant C2 that has property: + // shuffle(C2, ShMask) = C1 + // If such constant does not exist (example: ShMask=<0,0> and C1=<1,2>) + // reorder is not possible. 
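SimplifyVectorOp, introduced above, moves a shuffle below a binary operation when both operands are single-vector shuffles with the same mask, replacing two shuffles with one; the isSafeToSpeculativelyExecute guard (see PR20059) keeps trapping operations such as division from being evaluated on undefined lanes. A runnable check of the rewrite for an elementwise add (undef mask entries omitted for simplicity):

#include <cassert>
#include <vector>

using Vec = std::vector<int>;

static Vec shuffle(const Vec &V, const std::vector<int> &Mask) {
  Vec R;
  for (int M : Mask) R.push_back(V[M]);
  return R;
}

static Vec add(const Vec &A, const Vec &B) {
  Vec R;
  for (size_t i = 0; i < A.size(); ++i) R.push_back(A[i] + B[i]);
  return R;
}

int main() {
  Vec V1 = {1, 2, 3, 4}, V2 = {10, 20, 30, 40};
  std::vector<int> Mask = {3, 0, 0, 2};
  // Op(shuffle(v1, m), shuffle(v2, m)) == shuffle(Op(v1, v2), m)
  assert(add(shuffle(V1, Mask), shuffle(V2, Mask)) ==
         shuffle(add(V1, V2), Mask));
  return 0;
}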
+ SmallVector<Constant*, 16> C2M(VWidth, + UndefValue::get(C1->getType()->getScalarType())); + bool MayChange = true; + for (unsigned I = 0; I < VWidth; ++I) { + if (ShMask[I] >= 0) { + assert(ShMask[I] < (int)VWidth); + if (!isa<UndefValue>(C2M[ShMask[I]])) { + MayChange = false; + break; + } + C2M[ShMask[I]] = C1->getAggregateElement(I); + } + } + if (MayChange) { + Constant *C2 = ConstantVector::get(C2M); + Value *NewLHS, *NewRHS; + if (isa<Constant>(LHS)) { + NewLHS = C2; + NewRHS = Shuffle->getOperand(0); + } else { + NewLHS = Shuffle->getOperand(0); + NewRHS = C2; + } + Value *NewBO = CreateBinOpAsGiven(Inst, NewLHS, NewRHS, Builder); + Value *Res = Builder->CreateShuffleVector(NewBO, + UndefValue::get(Inst.getType()), Shuffle->getMask()); + return Res; + } + } + + return nullptr; +} + Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end()); - if (Value *V = SimplifyGEPInst(Ops, TD)) + if (Value *V = SimplifyGEPInst(Ops, DL)) return ReplaceInstUsesWith(GEP, V); Value *PtrOp = GEP.getOperand(0); // Eliminate unneeded casts for indices, and replace indices which displace // by multiples of a zero size type with zero. - if (TD) { + if (DL) { bool MadeChange = false; - Type *IntPtrTy = TD->getIntPtrType(GEP.getPointerOperandType()); + Type *IntPtrTy = DL->getIntPtrType(GEP.getPointerOperandType()); gep_type_iterator GTI = gep_type_begin(GEP); for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); @@ -1097,7 +1305,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // If the element type has zero size then any index over it is equivalent // to an index of zero, so replace it with zero if it is not zero already. if (SeqTy->getElementType()->isSized() && - TD->getTypeAllocSize(SeqTy->getElementType()) == 0) + DL->getTypeAllocSize(SeqTy->getElementType()) == 0) if (!isa<Constant>(*I) || !cast<Constant>(*I)->isNullValue()) { *I = Constant::getNullValue(IntPtrTy); MadeChange = true; @@ -1115,13 +1323,98 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (MadeChange) return &GEP; } + // Check to see if the inputs to the PHI node are getelementptr instructions. + if (PHINode *PN = dyn_cast<PHINode>(PtrOp)) { + GetElementPtrInst *Op1 = dyn_cast<GetElementPtrInst>(PN->getOperand(0)); + if (!Op1) + return nullptr; + + signed DI = -1; + + for (auto I = PN->op_begin()+1, E = PN->op_end(); I !=E; ++I) { + GetElementPtrInst *Op2 = dyn_cast<GetElementPtrInst>(*I); + if (!Op2 || Op1->getNumOperands() != Op2->getNumOperands()) + return nullptr; + + // Keep track of the type as we walk the GEP. + Type *CurTy = Op1->getOperand(0)->getType()->getScalarType(); + + for (unsigned J = 0, F = Op1->getNumOperands(); J != F; ++J) { + if (Op1->getOperand(J)->getType() != Op2->getOperand(J)->getType()) + return nullptr; + + if (Op1->getOperand(J) != Op2->getOperand(J)) { + if (DI == -1) { + // We have not seen any differences yet in the GEPs feeding the + // PHI yet, so we record this one if it is allowed to be a + // variable. + + // The first two arguments can vary for any GEP, the rest have to be + // static for struct slots + if (J > 1 && CurTy->isStructTy()) + return nullptr; + + DI = J; + } else { + // The GEP is different by more than one input. 
While this could be + // extended to support GEPs that vary by more than one variable it + // doesn't make sense since it greatly increases the complexity and + // would result in an R+R+R addressing mode which no backend + // directly supports and would need to be broken into several + // simpler instructions anyway. + return nullptr; + } + } + + // Sink down a layer of the type for the next iteration. + if (J > 0) { + if (CompositeType *CT = dyn_cast<CompositeType>(CurTy)) { + CurTy = CT->getTypeAtIndex(Op1->getOperand(J)); + } else { + CurTy = nullptr; + } + } + } + } + + GetElementPtrInst *NewGEP = cast<GetElementPtrInst>(Op1->clone()); + + if (DI == -1) { + // All the GEPs feeding the PHI are identical. Clone one down into our + // BB so that it can be merged with the current GEP. + GEP.getParent()->getInstList().insert(GEP.getParent()->getFirstNonPHI(), + NewGEP); + } else { + // All the GEPs feeding the PHI differ at a single offset. Clone a GEP + // into the current block so it can be merged, and create a new PHI to + // set that index. + Instruction *InsertPt = Builder->GetInsertPoint(); + Builder->SetInsertPoint(PN); + PHINode *NewPN = Builder->CreatePHI(Op1->getOperand(DI)->getType(), + PN->getNumOperands()); + Builder->SetInsertPoint(InsertPt); + + for (auto &I : PN->operands()) + NewPN->addIncoming(cast<GEPOperator>(I)->getOperand(DI), + PN->getIncomingBlock(I)); + + NewGEP->setOperand(DI, NewPN); + GEP.getParent()->getInstList().insert(GEP.getParent()->getFirstNonPHI(), + NewGEP); + NewGEP->setOperand(DI, NewPN); + } + + GEP.setOperand(0, NewGEP); + PtrOp = NewGEP; + } + // Combine Indices - If the source pointer to this getelementptr instruction // is a getelementptr instruction, combine the indices of the two // getelementptr instructions into a single instruction. // if (GEPOperator *Src = dyn_cast<GEPOperator>(PtrOp)) { if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src)) - return 0; + return nullptr; // Note that if our source is a gep chain itself then we wait for that // chain to be resolved before we perform this transformation. This @@ -1129,7 +1422,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (GEPOperator *SrcGEP = dyn_cast<GEPOperator>(Src->getOperand(0))) if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP)) - return 0; // Wait until our source is folded to completion. + return nullptr; // Wait until our source is folded to completion. SmallVector<Value*, 8> Indices; @@ -1157,7 +1450,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // intptr_t). Just avoid transforming this until the input has been // normalized. if (SO1->getType() != GO1->getType()) - return 0; + return nullptr; Sum = Builder->CreateAdd(SO1, GO1, PtrOp->getName()+".sum"); } @@ -1188,12 +1481,12 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Canonicalize (gep i8* X, -(ptrtoint Y)) to (sub (ptrtoint X), (ptrtoint Y)) // The GEP pattern is emitted by the SCEV expander for certain kinds of // pointer arithmetic. 
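The GEP-through-PHI merge above handles inputs that are either identical GEPs or GEPs differing in exactly one operand: one GEP is cloned below the PHI and a new PHI is created over the differing index. The same shape in scalar C++, with a conditional standing in for the PHI (purely illustrative):

#include <cassert>

// Before: two address computations feed the join (cond ? &A[i] : &A[j]).
// After: one PHI over the index, then a single address computation.
static int *mergedAddress(int *Base, bool Cond, int I, int J) {
  int Idx = Cond ? I : J;   // new PHI over the one differing operand
  return Base + Idx;        // single GEP cloned below the PHI
}

int main() {
  int A[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  for (int Cond = 0; Cond <= 1; ++Cond)
    assert(mergedAddress(A, Cond, 2, 5) == (Cond ? &A[2] : &A[5]));
  return 0;
}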
- if (TD && GEP.getNumIndices() == 1 && + if (DL && GEP.getNumIndices() == 1 && match(GEP.getOperand(1), m_Neg(m_PtrToInt(m_Value())))) { unsigned AS = GEP.getPointerAddressSpace(); if (GEP.getType() == Builder->getInt8PtrTy(AS) && GEP.getOperand(1)->getType()->getScalarSizeInBits() == - TD->getPointerSizeInBits(AS)) { + DL->getPointerSizeInBits(AS)) { Operator *Index = cast<Operator>(GEP.getOperand(1)); Value *PtrToInt = Builder->CreatePtrToInt(PtrOp, Index->getType()); Value *NewSub = Builder->CreateSub(PtrToInt, Index->getOperand(1)); @@ -1207,11 +1500,9 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // We do not handle pointer-vector geps here. if (!StrippedPtrTy) - return 0; - - if (StrippedPtr != PtrOp && - StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace()) { + return nullptr; + if (StrippedPtr != PtrOp) { bool HasZeroPointerIndex = false; if (ConstantInt *C = dyn_cast<ConstantInt>(GEP.getOperand(1))) HasZeroPointerIndex = C->isZero(); @@ -1234,7 +1525,15 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { GetElementPtrInst *Res = GetElementPtrInst::Create(StrippedPtr, Idx, GEP.getName()); Res->setIsInBounds(GEP.isInBounds()); - return Res; + if (StrippedPtrTy->getAddressSpace() == GEP.getAddressSpace()) + return Res; + // Insert Res, and create an addrspacecast. + // e.g., + // GEP (addrspacecast i8 addrspace(1)* X to [0 x i8]*), i32 0, ... + // -> + // %0 = GEP i8 addrspace(1)* X, ... + // addrspacecast i8 addrspace(1)* %0 to i8* + return new AddrSpaceCastInst(Builder->Insert(Res), GEP.getType()); } if (ArrayType *XATy = @@ -1246,8 +1545,24 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // to an array of the same type as the destination pointer // array. Because the array type is never stepped over (there // is a leading zero) we can fold the cast into this GEP. - GEP.setOperand(0, StrippedPtr); - return &GEP; + if (StrippedPtrTy->getAddressSpace() == GEP.getAddressSpace()) { + GEP.setOperand(0, StrippedPtr); + return &GEP; + } + // Cannot replace the base pointer directly because StrippedPtr's + // address space is different. Instead, create a new GEP followed by + // an addrspacecast. + // e.g., + // GEP (addrspacecast [10 x i8] addrspace(1)* X to [0 x i8]*), + // i32 0, ... + // -> + // %0 = GEP [10 x i8] addrspace(1)* X, ... + // addrspacecast i8 addrspace(1)* %0 to i8* + SmallVector<Value*, 8> Idx(GEP.idx_begin(), GEP.idx_end()); + Value *NewGEP = GEP.isInBounds() ? + Builder->CreateInBoundsGEP(StrippedPtr, Idx, GEP.getName()) : + Builder->CreateGEP(StrippedPtr, Idx, GEP.getName()); + return new AddrSpaceCastInst(NewGEP, GEP.getType()); } } } @@ -1257,27 +1572,29 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // into: %t1 = getelementptr [2 x i32]* %str, i32 0, i32 %V; bitcast Type *SrcElTy = StrippedPtrTy->getElementType(); Type *ResElTy = PtrOp->getType()->getPointerElementType(); - if (TD && SrcElTy->isArrayTy() && - TD->getTypeAllocSize(SrcElTy->getArrayElementType()) == - TD->getTypeAllocSize(ResElTy)) { - Type *IdxType = TD->getIntPtrType(GEP.getType()); + if (DL && SrcElTy->isArrayTy() && + DL->getTypeAllocSize(SrcElTy->getArrayElementType()) == + DL->getTypeAllocSize(ResElTy)) { + Type *IdxType = DL->getIntPtrType(GEP.getType()); Value *Idx[2] = { Constant::getNullValue(IdxType), GEP.getOperand(1) }; Value *NewGEP = GEP.isInBounds() ? 
Builder->CreateInBoundsGEP(StrippedPtr, Idx, GEP.getName()) : Builder->CreateGEP(StrippedPtr, Idx, GEP.getName()); + // V and GEP are both pointer types --> BitCast - return new BitCastInst(NewGEP, GEP.getType()); + return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP, + GEP.getType()); } // Transform things like: // %V = mul i64 %N, 4 // %t = getelementptr i8* bitcast (i32* %arr to i8*), i32 %V // into: %t1 = getelementptr i32* %arr, i32 %N; bitcast - if (TD && ResElTy->isSized() && SrcElTy->isSized()) { + if (DL && ResElTy->isSized() && SrcElTy->isSized()) { // Check that changing the type amounts to dividing the index by a scale // factor. - uint64_t ResSize = TD->getTypeAllocSize(ResElTy); - uint64_t SrcSize = TD->getTypeAllocSize(SrcElTy); + uint64_t ResSize = DL->getTypeAllocSize(ResElTy); + uint64_t SrcSize = DL->getTypeAllocSize(SrcElTy); if (ResSize && SrcSize % ResSize == 0) { Value *Idx = GEP.getOperand(1); unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits(); @@ -1285,7 +1602,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Earlier transforms ensure that the index has type IntPtrType, which // considerably simplifies the logic by eliminating implicit casts. - assert(Idx->getType() == TD->getIntPtrType(GEP.getType()) && + assert(Idx->getType() == DL->getIntPtrType(GEP.getType()) && "Index not cast to pointer width?"); bool NSW; @@ -1296,8 +1613,10 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { Value *NewGEP = GEP.isInBounds() && NSW ? Builder->CreateInBoundsGEP(StrippedPtr, NewIdx, GEP.getName()) : Builder->CreateGEP(StrippedPtr, NewIdx, GEP.getName()); + // The NewGEP must be pointer typed, so must the old one -> BitCast - return new BitCastInst(NewGEP, GEP.getType()); + return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP, + GEP.getType()); } } } @@ -1306,13 +1625,13 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // getelementptr i8* bitcast ([100 x double]* X to i8*), i32 %tmp // (where tmp = 8*tmp2) into: // getelementptr [100 x double]* %arr, i32 0, i32 %tmp2; bitcast - if (TD && ResElTy->isSized() && SrcElTy->isSized() && + if (DL && ResElTy->isSized() && SrcElTy->isSized() && SrcElTy->isArrayTy()) { // Check that changing to the array element type amounts to dividing the // index by a scale factor. - uint64_t ResSize = TD->getTypeAllocSize(ResElTy); + uint64_t ResSize = DL->getTypeAllocSize(ResElTy); uint64_t ArrayEltSize - = TD->getTypeAllocSize(SrcElTy->getArrayElementType()); + = DL->getTypeAllocSize(SrcElTy->getArrayElementType()); if (ResSize && ArrayEltSize % ResSize == 0) { Value *Idx = GEP.getOperand(1); unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits(); @@ -1320,7 +1639,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Earlier transforms ensure that the index has type IntPtrType, which // considerably simplifies the logic by eliminating implicit casts. - assert(Idx->getType() == TD->getIntPtrType(GEP.getType()) && + assert(Idx->getType() == DL->getIntPtrType(GEP.getType()) && "Index not cast to pointer width?"); bool NSW; @@ -1329,7 +1648,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // If the multiplication NewIdx * Scale may overflow then the new // GEP may not be "inbounds". 
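The transforms in this stretch rescale an i8-typed index into a typed one by dividing the byte offset by the element size, with Descale doing the divisibility analysis for non-constant indices. A hedged sketch of the rule with a fixed-width element type (illustrative helper name):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <optional>

// A byte offset becomes a typed index exactly when it is a whole multiple
// of the element size; otherwise it points into the middle of an element
// and the GEP must stay byte-typed.
template <typename T>
static std::optional<size_t> byteOffsetToIndex(size_t ByteOff) {
  if (ByteOff % sizeof(T) != 0)
    return std::nullopt;
  return ByteOff / sizeof(T);
}

int main() {
  assert(*byteOffsetToIndex<std::int32_t>(8) == 8 / sizeof(std::int32_t));
  assert(!byteOffsetToIndex<std::int32_t>(6));  // not a whole number of i32s
  return 0;
}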
Value *Off[2] = { - Constant::getNullValue(TD->getIntPtrType(GEP.getType())), + Constant::getNullValue(DL->getIntPtrType(GEP.getType())), NewIdx }; @@ -1337,15 +1656,16 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { Builder->CreateInBoundsGEP(StrippedPtr, Off, GEP.getName()) : Builder->CreateGEP(StrippedPtr, Off, GEP.getName()); // The NewGEP must be pointer typed, so must the old one -> BitCast - return new BitCastInst(NewGEP, GEP.getType()); + return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP, + GEP.getType()); } } } } } - if (!TD) - return 0; + if (!DL) + return nullptr; /// See if we can simplify: /// X = bitcast A* to B* @@ -1355,10 +1675,10 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (BitCastInst *BCI = dyn_cast<BitCastInst>(PtrOp)) { Value *Operand = BCI->getOperand(0); PointerType *OpType = cast<PointerType>(Operand->getType()); - unsigned OffsetBits = TD->getPointerTypeSizeInBits(OpType); + unsigned OffsetBits = DL->getPointerTypeSizeInBits(OpType); APInt Offset(OffsetBits, 0); if (!isa<BitCastInst>(Operand) && - GEP.accumulateConstantOffset(*TD, Offset) && + GEP.accumulateConstantOffset(*DL, Offset) && StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace()) { // If this GEP instruction doesn't move the pointer, just replace the GEP @@ -1397,7 +1717,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { } } - return 0; + return nullptr; } static bool @@ -1408,9 +1728,8 @@ isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users, do { Instruction *PI = Worklist.pop_back_val(); - for (Value::use_iterator UI = PI->use_begin(), UE = PI->use_end(); UI != UE; - ++UI) { - Instruction *I = cast<Instruction>(*UI); + for (User *U : PI->users()) { + Instruction *I = cast<Instruction>(U); switch (I->getOpcode()) { default: // Give up the moment we see something we can't handle. @@ -1513,7 +1832,7 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) { } return EraseInstFromFunction(MI); } - return 0; + return nullptr; } /// \brief Move the call to free before a NULL test. @@ -1542,30 +1861,30 @@ tryToMoveFreeBeforeNullTest(CallInst &FI) { // would duplicate the call to free in each predecessor and it may // not be profitable even for code size. if (!PredBB) - return 0; + return nullptr; // Validate constraint #2: Does this block contains only the call to // free and an unconditional branch? // FIXME: We could check if we can speculate everything in the // predecessor block if (FreeInstrBB->size() != 2) - return 0; + return nullptr; BasicBlock *SuccBB; if (!match(FreeInstrBB->getTerminator(), m_UnconditionalBr(SuccBB))) - return 0; + return nullptr; // Validate the rest of constraint #1 by matching on the pred branch. TerminatorInst *TI = PredBB->getTerminator(); BasicBlock *TrueBB, *FalseBB; ICmpInst::Predicate Pred; if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Op), m_Zero()), TrueBB, FalseBB))) - return 0; + return nullptr; if (Pred != ICmpInst::ICMP_EQ && Pred != ICmpInst::ICMP_NE) - return 0; + return nullptr; // Validate constraint #3: Ensure the null case just falls through. if (SuccBB != (Pred == ICmpInst::ICMP_EQ ? TrueBB : FalseBB)) - return 0; + return nullptr; assert(FreeInstrBB == (Pred == ICmpInst::ICMP_EQ ? 
FalseBB : TrueBB) && "Broken CFG: missing edge from predecessor to successor"); @@ -1600,14 +1919,14 @@ Instruction *InstCombiner::visitFree(CallInst &FI) { if (Instruction *I = tryToMoveFreeBeforeNullTest(FI)) return I; - return 0; + return nullptr; } Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { // Change br (not X), label True, label False to: br X, label False, True - Value *X = 0; + Value *X = nullptr; BasicBlock *TrueDest; BasicBlock *FalseDest; if (match(&BI, m_Br(m_Not(m_Value(X)), TrueDest, FalseDest)) && @@ -1618,7 +1937,7 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { return &BI; } - // Cannonicalize fcmp_one -> fcmp_oeq + // Canonicalize fcmp_one -> fcmp_oeq FCmpInst::Predicate FPred; Value *Y; if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)), TrueDest, FalseDest)) && @@ -1634,7 +1953,7 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { return &BI; } - // Cannonicalize icmp_ne -> icmp_eq + // Canonicalize icmp_ne -> icmp_eq ICmpInst::Predicate IPred; if (match(&BI, m_Br(m_ICmp(IPred, m_Value(X), m_Value(Y)), TrueDest, FalseDest)) && @@ -1650,7 +1969,7 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { return &BI; } - return 0; + return nullptr; } Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { @@ -1674,7 +1993,7 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { return &SI; } } - return 0; + return nullptr; } Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { @@ -1691,7 +2010,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { // first index return ExtractValueInst::Create(C2, EV.getIndices().slice(1)); } - return 0; // Can't handle other constants + return nullptr; // Can't handle other constants } if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) { @@ -1824,7 +2143,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { // and if again single-use then via load (gep (gep)) to load (gep). // However, double extracts from e.g. function arguments or return values // aren't handled yet. - return 0; + return nullptr; } enum Personality_Type { @@ -1880,7 +2199,7 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { // Simplify the list of clauses, eg by removing repeated catch clauses // (these are often created by inlining). bool MakeNewInstruction = false; // If true, recreate using the following: - SmallVector<Value *, 16> NewClauses; // - Clauses for the new instruction; + SmallVector<Constant *, 16> NewClauses; // - Clauses for the new instruction; bool CleanupFlag = LI.isCleanup(); // - The new instruction is a cleanup. SmallPtrSet<Value *, 16> AlreadyCaught; // Typeinfos known caught already. @@ -1888,8 +2207,8 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { bool isLastClause = i + 1 == e; if (LI.isCatch(i)) { // A catch clause. - Value *CatchClause = LI.getClause(i); - Constant *TypeInfo = cast<Constant>(CatchClause->stripPointerCasts()); + Constant *CatchClause = LI.getClause(i); + Constant *TypeInfo = CatchClause->stripPointerCasts(); // If we already saw this clause, there is no point in having a second // copy of it. @@ -1918,7 +2237,7 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { // equal (for example if one represents a C++ class, and the other some // class derived from it). 
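visitBranchInst canonicalizes a branch on a negated or invertible condition by rewriting the predicate and swapping the successors, as the fcmp_one and icmp_ne cases above do. A minimal model of the br(not X) case (the Branch struct is illustrative, not LLVM API):

#include <cassert>

struct Branch { bool CondNegated; int TrueDest, FalseDest; };

// br (not X), T, F  ->  br X, F, T: swap the successors instead of keeping
// the negation alive, so later combines can erase the 'not'.
static Branch canonicalize(Branch B) {
  if (B.CondNegated) {
    B.CondNegated = false;
    int T = B.TrueDest;
    B.TrueDest = B.FalseDest;
    B.FalseDest = T;
  }
  return B;
}

int main() {
  Branch B = canonicalize({true, 1, 2});
  assert(!B.CondNegated && B.TrueDest == 2 && B.FalseDest == 1);
  return 0;
}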
assert(LI.isFilter(i) && "Unsupported landingpad clause!"); - Value *FilterClause = LI.getClause(i); + Constant *FilterClause = LI.getClause(i); ArrayType *FilterType = cast<ArrayType>(FilterClause->getType()); unsigned NumTypeInfos = FilterType->getNumElements(); @@ -1962,8 +2281,8 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { // catch-alls. If so, the filter can be discarded. bool SawCatchAll = false; for (unsigned j = 0; j != NumTypeInfos; ++j) { - Value *Elt = Filter->getOperand(j); - Constant *TypeInfo = cast<Constant>(Elt->stripPointerCasts()); + Constant *Elt = Filter->getOperand(j); + Constant *TypeInfo = Elt->stripPointerCasts(); if (isCatchAll(Personality, TypeInfo)) { // This element is a catch-all. Bail out, noting this fact. SawCatchAll = true; @@ -2068,7 +2387,7 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { continue; // If Filter is a subset of LFilter, i.e. every element of Filter is also // an element of LFilter, then discard LFilter. - SmallVectorImpl<Value *>::iterator J = NewClauses.begin() + j; + SmallVectorImpl<Constant *>::iterator J = NewClauses.begin() + j; // If Filter is empty then it is a subset of LFilter. if (!FElts) { // Discard LFilter. @@ -2163,7 +2482,7 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { return &LI; } - return 0; + return nullptr; } @@ -2214,7 +2533,7 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { static bool AddReachableCodeToWorklist(BasicBlock *BB, SmallPtrSet<BasicBlock*, 64> &Visited, InstCombiner &IC, - const DataLayout *TD, + const DataLayout *DL, const TargetLibraryInfo *TLI) { bool MadeIRChange = false; SmallVector<BasicBlock*, 256> Worklist; @@ -2242,7 +2561,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, // ConstantProp instruction if trivially constant. if (!Inst->use_empty() && isa<Constant>(Inst->getOperand(0))) - if (Constant *C = ConstantFoldInstruction(Inst, TD, TLI)) { + if (Constant *C = ConstantFoldInstruction(Inst, DL, TLI)) { DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *Inst << '\n'); Inst->replaceAllUsesWith(C); @@ -2251,16 +2570,16 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, continue; } - if (TD) { + if (DL) { // See if we can constant fold its operands. for (User::op_iterator i = Inst->op_begin(), e = Inst->op_end(); i != e; ++i) { ConstantExpr *CE = dyn_cast<ConstantExpr>(i); - if (CE == 0) continue; + if (CE == nullptr) continue; Constant*& FoldRes = FoldedConstants[CE]; if (!FoldRes) - FoldRes = ConstantFoldConstantExpression(CE, TD, TLI); + FoldRes = ConstantFoldConstantExpression(CE, DL, TLI); if (!FoldRes) FoldRes = CE; @@ -2327,7 +2646,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { // the reachable instructions. Ignore blocks that are not reachable. Keep // track of which blocks we visit. SmallPtrSet<BasicBlock*, 64> Visited; - MadeIRChange |= AddReachableCodeToWorklist(F.begin(), Visited, *this, TD, + MadeIRChange |= AddReachableCodeToWorklist(F.begin(), Visited, *this, DL, TLI); // Do a quick scan over the function. If we find any blocks that are @@ -2360,7 +2679,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { while (!Worklist.isEmpty()) { Instruction *I = Worklist.RemoveOne(); - if (I == 0) continue; // skip null values. + if (I == nullptr) continue; // skip null values. // Check to see if we can DCE the instruction. 
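The landing-pad cleanup in this hunk drops a catch clause whose typeinfo is already caught by an earlier clause, since the earlier clause always wins at run time; inlining commonly produces such duplicates. A sketch with strings standing in for typeinfo constants:

#include <cstdio>
#include <set>
#include <string>
#include <vector>

// Keep only the first occurrence of each typeinfo, mirroring the
// AlreadyCaught set above; later duplicates can never fire.
static std::vector<std::string>
dedupCatches(const std::vector<std::string> &Clauses) {
  std::set<std::string> AlreadyCaught;
  std::vector<std::string> Out;
  for (const std::string &C : Clauses)
    if (AlreadyCaught.insert(C).second)   // first time this typeinfo appears
      Out.push_back(C);
  return Out;
}

int main() {
  for (const std::string &C : dedupCatches({"Base", "Derived", "Base"}))
    std::printf("catch %s\n", C.c_str());
  return 0;
}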
     if (isInstructionTriviallyDead(I, TLI)) {
@@ -2373,7 +2692,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
 
     // Instruction isn't dead, see if we can constant propagate it.
     if (!I->use_empty() && isa<Constant>(I->getOperand(0)))
-      if (Constant *C = ConstantFoldInstruction(I, TD, TLI)) {
+      if (Constant *C = ConstantFoldInstruction(I, DL, TLI)) {
         DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n');
 
         // Add operands to the worklist.
@@ -2387,12 +2706,12 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
     // See if we can trivially sink this instruction to a successor basic block.
     if (I->hasOneUse()) {
       BasicBlock *BB = I->getParent();
-      Instruction *UserInst = cast<Instruction>(I->use_back());
+      Instruction *UserInst = cast<Instruction>(*I->user_begin());
       BasicBlock *UserParent;
 
       // Get the block the use occurs in.
       if (PHINode *PN = dyn_cast<PHINode>(UserInst))
-        UserParent = PN->getIncomingBlock(I->use_begin().getUse());
+        UserParent = PN->getIncomingBlock(*I->use_begin());
       else
         UserParent = UserInst->getParent();
 
@@ -2408,9 +2727,18 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
         // If the user is one of our immediate successors, and if that successor
         // only has us as a predecessors (we'd have to split the critical edge
         // otherwise), we can keep going.
-        if (UserIsSuccessor && UserParent->getSinglePredecessor())
+        if (UserIsSuccessor && UserParent->getSinglePredecessor()) {
           // Okay, the CFG is simple enough, try to sink this instruction.
-          MadeIRChange |= TryToSinkInstruction(I, UserParent);
+          if (TryToSinkInstruction(I, UserParent)) {
+            MadeIRChange = true;
+            // We'll add uses of the sunk instruction below, but since sinking
+            // can expose opportunities for its *operands*, add them to the
+            // worklist.
+            for (Use &U : I->operands())
+              if (Instruction *OpI = dyn_cast<Instruction>(U.get()))
+                Worklist.Add(OpI);
+          }
+        }
       }
     }
 
@@ -2482,23 +2810,27 @@ namespace {
 class InstCombinerLibCallSimplifier : public LibCallSimplifier {
   InstCombiner *IC;
 public:
-  InstCombinerLibCallSimplifier(const DataLayout *TD,
+  InstCombinerLibCallSimplifier(const DataLayout *DL,
                                 const TargetLibraryInfo *TLI,
                                 InstCombiner *IC)
-    : LibCallSimplifier(TD, TLI, UnsafeFPShrink) {
+    : LibCallSimplifier(DL, TLI, UnsafeFPShrink) {
     this->IC = IC;
   }
 
   /// replaceAllUsesWith - override so that instruction replacement
   /// can be defined in terms of the instruction combiner framework.
-  virtual void replaceAllUsesWith(Instruction *I, Value *With) const {
+  void replaceAllUsesWith(Instruction *I, Value *With) const override {
     IC->ReplaceInstUsesWith(*I, With);
   }
 };
 }
 
 bool InstCombiner::runOnFunction(Function &F) {
-  TD = getAnalysisIfAvailable<DataLayout>();
+  if (skipOptnoneFunction(F))
+    return false;
+
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  DL = DLP ? &DLP->getDataLayout() : nullptr;
   TLI = &getAnalysis<TargetLibraryInfo>();
 
   // Minimizing size?
   MinimizeSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
@@ -2507,11 +2839,11 @@ bool InstCombiner::runOnFunction(Function &F) {
 
   /// Builder - This is an IRBuilder that automatically inserts new
   /// instructions into the worklist when they are created.
   IRBuilder<true, TargetFolder, InstCombineIRInserter>
-    TheBuilder(F.getContext(), TargetFolder(TD),
+    TheBuilder(F.getContext(), TargetFolder(DL),
                InstCombineIRInserter(Worklist));
   Builder = &TheBuilder;
 
-  InstCombinerLibCallSimplifier TheSimplifier(TD, TLI, this);
+  InstCombinerLibCallSimplifier TheSimplifier(DL, TLI, this);
   Simplifier = &TheSimplifier;
 
   bool EverMadeChange = false;
@@ -2525,7 +2857,7 @@ bool InstCombiner::runOnFunction(Function &F) {
   while (DoOneIteration(F, Iteration++))
     EverMadeChange = true;
 
-  Builder = 0;
+  Builder = nullptr;
   return EverMadeChange;
 }
 
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index d731ec5..124ffe2 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -13,52 +13,55 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "asan"
-
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/OwningPtr.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Triple.h"
-#include "llvm/DIBuilder.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
-#include "llvm/InstVisitor.h"
-#include "llvm/Support/CallSite.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Endian.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
+#include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
-#include "llvm/Transforms/Utils/SpecialCaseList.h"
 #include <algorithm>
 #include <string>
+#include <system_error>
 
 using namespace llvm;
 
+#define DEBUG_TYPE "asan"
+
 static const uint64_t kDefaultShadowScale = 3;
 static const uint64_t kDefaultShadowOffset32 = 1ULL << 29;
+static const uint64_t kIOSShadowOffset32 = 1ULL << 30;
 static const uint64_t kDefaultShadowOffset64 = 1ULL << 44;
-static const uint64_t kDefaultShort64bitShadowOffset = 0x7FFF8000;  // < 2G.
+static const uint64_t kSmallX86_64ShadowOffset = 0x7FFF8000;  // < 2G.
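The constants above plug into ASan's shadow-mapping rule, Shadow = (Mem >> Scale) + Offset, which getShadowMapping() below fills in per platform. A minimal standalone sketch of that arithmetic, assuming the Linux/x86_64 values from this file (Scale = kDefaultShadowScale = 3, Offset = kSmallX86_64ShadowOffset = 0x7FFF8000); the helper name and the example address are ours, not part of the patch:

    #include <cstdint>
    #include <cstdio>

    // One shadow byte describes one 8-byte granule of application memory;
    // a shadow value of 0 means the whole granule is addressable.
    static uint64_t memToShadow(uint64_t Mem) {
      return (Mem >> 3) + 0x7FFF8000ULL;  // Shadow = (Mem >> Scale) + Offset
    }

    int main() {
      uint64_t Addr = 0x602000000010ULL;  // hypothetical heap address
      printf("shadow(%#llx) = %#llx\n", (unsigned long long)Addr,
             (unsigned long long)memToShadow(Addr));
      return 0;
    }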
 static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 41;
 static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa8000;
+static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30;
+static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46;
 
 static const size_t kMinStackMallocSize = 1 << 6;  // 64B
 static const size_t kMaxStackMallocSize = 1 << 16; // 64K
@@ -67,7 +70,7 @@ static const uintptr_t kRetiredStackFrameMagic = 0x45E0360E;
 
 static const char *const kAsanModuleCtorName = "asan.module_ctor";
 static const char *const kAsanModuleDtorName = "asan.module_dtor";
-static const int kAsanCtorAndCtorPriority = 1;
+static const int kAsanCtorAndDtorPriority = 1;
 static const char *const kAsanReportErrorTemplate = "__asan_report_";
 static const char *const kAsanReportLoadN = "__asan_report_load_n";
 static const char *const kAsanReportStoreN = "__asan_report_store_n";
@@ -76,11 +79,12 @@ static const char *const kAsanUnregisterGlobalsName =
     "__asan_unregister_globals";
 static const char *const kAsanPoisonGlobalsName = "__asan_before_dynamic_init";
 static const char *const kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init";
-static const char *const kAsanInitName = "__asan_init_v3";
+static const char *const kAsanInitName = "__asan_init_v4";
+static const char *const kAsanCovModuleInitName = "__sanitizer_cov_module_init";
 static const char *const kAsanCovName = "__sanitizer_cov";
+static const char *const kAsanPtrCmp = "__sanitizer_ptr_cmp";
+static const char *const kAsanPtrSub = "__sanitizer_ptr_sub";
 static const char *const kAsanHandleNoReturnName = "__asan_handle_no_return";
-static const char *const kAsanMappingOffsetName = "__asan_mapping_offset";
-static const char *const kAsanMappingScaleName = "__asan_mapping_scale";
 static const int kMaxAsanStackMallocSizeClass = 10;
 static const char *const kAsanStackMallocNameTemplate = "__asan_stack_malloc_";
 static const char *const kAsanStackFreeNameTemplate = "__asan_stack_free_";
@@ -93,11 +97,6 @@ static const char *const kAsanUnpoisonStackMemoryName =
 static const char *const kAsanOptionDetectUAR =
     "__asan_option_detect_stack_use_after_return";
 
-// These constants must match the definitions in the run-time library.
-static const int kAsanStackLeftRedzoneMagic = 0xf1;
-static const int kAsanStackMidRedzoneMagic = 0xf2;
-static const int kAsanStackRightRedzoneMagic = 0xf3;
-static const int kAsanStackPartialRedzoneMagic = 0xf4;
 #ifndef NDEBUG
 static const int kAsanStackAfterReturnMagic = 0xf5;
 #endif
@@ -129,23 +128,36 @@ static cl::opt<int> ClMaxInsnsToInstrumentPerBB("asan-max-ins-per-bb",
 // This flag may need to be replaced with -f[no]asan-stack.
 static cl::opt<bool> ClStack("asan-stack",
     cl::desc("Handle stack memory"), cl::Hidden, cl::init(true));
-// This flag may need to be replaced with -f[no]asan-use-after-return.
 static cl::opt<bool> ClUseAfterReturn("asan-use-after-return",
-    cl::desc("Check return-after-free"), cl::Hidden, cl::init(false));
+    cl::desc("Check return-after-free"), cl::Hidden, cl::init(true));
 // This flag may need to be replaced with -f[no]asan-globals.
 static cl::opt<bool> ClGlobals("asan-globals",
    cl::desc("Handle global objects"), cl::Hidden, cl::init(true));
-static cl::opt<bool> ClCoverage("asan-coverage",
-    cl::desc("ASan coverage"), cl::Hidden, cl::init(false));
+static cl::opt<int> ClCoverage("asan-coverage",
+    cl::desc("ASan coverage. 0: none, 1: entry block, 2: all blocks"),
+    cl::Hidden, cl::init(false));
+static cl::opt<int> ClCoverageBlockThreshold("asan-coverage-block-threshold",
+    cl::desc("Add coverage instrumentation only to the entry block if there "
+             "are more than this number of blocks."),
+    cl::Hidden, cl::init(1500));
 static cl::opt<bool> ClInitializers("asan-initialization-order",
-    cl::desc("Handle C++ initializer order"), cl::Hidden, cl::init(false));
-static cl::opt<bool> ClMemIntrin("asan-memintrin",
-    cl::desc("Handle memset/memcpy/memmove"), cl::Hidden, cl::init(true));
-static cl::opt<bool> ClRealignStack("asan-realign-stack",
-    cl::desc("Realign stack to 32"), cl::Hidden, cl::init(true));
-static cl::opt<std::string> ClBlacklistFile("asan-blacklist",
-    cl::desc("File containing the list of objects to ignore "
-             "during instrumentation"), cl::Hidden);
+    cl::desc("Handle C++ initializer order"), cl::Hidden, cl::init(true));
+static cl::opt<bool> ClInvalidPointerPairs("asan-detect-invalid-pointer-pair",
+    cl::desc("Instrument <, <=, >, >=, - with pointer operands"),
+    cl::Hidden, cl::init(false));
+static cl::opt<unsigned> ClRealignStack("asan-realign-stack",
+    cl::desc("Realign stack to the value of this flag (power of two)"),
+    cl::Hidden, cl::init(32));
+static cl::opt<int> ClInstrumentationWithCallsThreshold(
+    "asan-instrumentation-with-call-threshold",
+    cl::desc("If the function being instrumented contains more than "
+             "this number of memory accesses, use callbacks instead of "
+             "inline checks (-1 means never use callbacks)."),
+    cl::Hidden, cl::init(7000));
+static cl::opt<std::string> ClMemoryAccessCallbackPrefix(
+    "asan-memory-access-callback-prefix",
+    cl::desc("Prefix for memory access callbacks"), cl::Hidden,
+    cl::init("__asan_"));
 
 // This is an experimental feature that will allow to choose between
 // instrumented and non-instrumented code at link-time.
@@ -165,11 +177,6 @@ static cl::opt<bool> ClKeepUninstrumented("asan-keep-uninstrumented-functions",
 // Shadow = (Mem >> scale) + (1 << offset_log)
 static cl::opt<int> ClMappingScale("asan-mapping-scale",
     cl::desc("scale of asan shadow mapping"), cl::Hidden, cl::init(0));
-static cl::opt<int> ClMappingOffsetLog("asan-mapping-offset-log",
-    cl::desc("offset of asan shadow mapping"), cl::Hidden, cl::init(-1));
-static cl::opt<bool> ClShort64BitOffset("asan-short-64bit-mapping-offset",
-    cl::desc("Use short immediate constant as the mapping offset for 64bit"),
-    cl::Hidden, cl::init(true));
 
 // Optimization flags. Not user visible, used mostly for testing
 // and benchmarking the tool.
@@ -205,29 +212,86 @@ STATISTIC(NumOptimizedAccessesToGlobalVar,
           "Number of optimized accesses to global vars");
 
 namespace {
-/// A set of dynamically initialized globals extracted from metadata.
-class SetOfDynamicallyInitializedGlobals {
+/// Frontend-provided metadata for global variables.
+class GlobalsMetadata {
 public:
-  void Init(Module& M) {
-    // Clang generates metadata identifying all dynamically initialized globals.
-    NamedMDNode *DynamicGlobals =
-        M.getNamedMetadata("llvm.asan.dynamically_initialized_globals");
-    if (!DynamicGlobals)
+  struct Entry {
+    Entry()
+        : SourceLoc(nullptr), Name(nullptr), IsDynInit(false),
+          IsBlacklisted(false) {}
+    GlobalVariable *SourceLoc;
+    GlobalVariable *Name;
+    bool IsDynInit;
+    bool IsBlacklisted;
+  };
+
+  GlobalsMetadata() : inited_(false) {}
+
+  void init(Module& M) {
+    assert(!inited_);
+    inited_ = true;
+    NamedMDNode *Globals = M.getNamedMetadata("llvm.asan.globals");
+    if (!Globals)
       return;
-    for (int i = 0, n = DynamicGlobals->getNumOperands(); i < n; ++i) {
-      MDNode *MDN = DynamicGlobals->getOperand(i);
-      assert(MDN->getNumOperands() == 1);
-      Value *VG = MDN->getOperand(0);
-      // The optimizer may optimize away a global entirely, in which case we
-      // cannot instrument access to it.
-      if (!VG)
+    for (auto MDN : Globals->operands()) {
+      // Metadata node contains the global and the fields of "Entry".
+      assert(MDN->getNumOperands() == 5);
+      Value *V = MDN->getOperand(0);
+      // The optimizer may optimize away a global entirely.
+      if (!V)
         continue;
-      DynInitGlobals.insert(cast<GlobalVariable>(VG));
+      GlobalVariable *GV = cast<GlobalVariable>(V);
+      // We can already have an entry for GV if it was merged with another
+      // global.
+      Entry &E = Entries[GV];
+      if (Value *Loc = MDN->getOperand(1)) {
+        GlobalVariable *GVLoc = cast<GlobalVariable>(Loc);
+        E.SourceLoc = GVLoc;
+        addSourceLocationGlobal(GVLoc);
+      }
+      if (Value *Name = MDN->getOperand(2)) {
+        GlobalVariable *GVName = cast<GlobalVariable>(Name);
+        E.Name = GVName;
+        InstrumentationGlobals.insert(GVName);
+      }
+      ConstantInt *IsDynInit = cast<ConstantInt>(MDN->getOperand(3));
+      E.IsDynInit |= IsDynInit->isOne();
+      ConstantInt *IsBlacklisted = cast<ConstantInt>(MDN->getOperand(4));
+      E.IsBlacklisted |= IsBlacklisted->isOne();
     }
   }
-  bool Contains(GlobalVariable *G) { return DynInitGlobals.count(G) != 0; }
+
+  /// Returns metadata entry for a given global.
+  Entry get(GlobalVariable *G) const {
+    auto Pos = Entries.find(G);
+    return (Pos != Entries.end()) ? Pos->second : Entry();
+  }
+
+  /// Check if the global was generated by the instrumentation
+  /// (we don't want to instrument it again in this case).
+  bool isInstrumentationGlobal(GlobalVariable *G) const {
+    return InstrumentationGlobals.count(G);
+  }
+
 private:
-  SmallSet<GlobalValue*, 32> DynInitGlobals;
+  bool inited_;
+  DenseMap<GlobalVariable*, Entry> Entries;
+  // Globals generated by the frontend instrumentation.
+  DenseSet<GlobalVariable*> InstrumentationGlobals;
+
+  void addSourceLocationGlobal(GlobalVariable *SourceLocGV) {
+    // Source location global is a struct with layout:
+    // {
+    //    filename,
+    //    i32 line_number,
+    //    i32 column_number,
+    // }
+    InstrumentationGlobals.insert(SourceLocGV);
+    ConstantStruct *Contents =
+        cast<ConstantStruct>(SourceLocGV->getInitializer());
+    GlobalVariable *FilenameGV = cast<GlobalVariable>(Contents->getOperand(0));
+    InstrumentationGlobals.insert(FilenameGV);
+  }
 };
 
 /// This struct defines the shadow mapping using the rule:
@@ -238,11 +302,12 @@ struct ShadowMapping {
   bool OrShadowOffset;
 };
 
-static ShadowMapping getShadowMapping(const Module &M, int LongSize,
-                                      bool ZeroBaseShadow) {
+static ShadowMapping getShadowMapping(const Module &M, int LongSize) {
   llvm::Triple TargetTriple(M.getTargetTriple());
   bool IsAndroid = TargetTriple.getEnvironment() == llvm::Triple::Android;
-  bool IsMacOSX = TargetTriple.getOS() == llvm::Triple::MacOSX;
+  bool IsIOS = TargetTriple.getOS() == llvm::Triple::IOS;
+  bool IsFreeBSD = TargetTriple.getOS() == llvm::Triple::FreeBSD;
+  bool IsLinux = TargetTriple.getOS() == llvm::Triple::Linux;
   bool IsPPC64 = TargetTriple.getArch() == llvm::Triple::ppc64 ||
                  TargetTriple.getArch() == llvm::Triple::ppc64le;
   bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64;
@@ -251,22 +316,26 @@ static ShadowMapping getShadowMapping(const Module &M, int LongSize,
 
   ShadowMapping Mapping;
 
-  // OR-ing shadow offset if more efficient (at least on x86),
-  // but on ppc64 we have to use add since the shadow offset is not neccesary
-  // 1/8-th of the address space.
-  Mapping.OrShadowOffset = !IsPPC64 && !ClShort64BitOffset;
-
-  Mapping.Offset = (IsAndroid || ZeroBaseShadow) ? 0 :
-      (LongSize == 32 ?
-        (IsMIPS32 ? kMIPS32_ShadowOffset32 : kDefaultShadowOffset32) :
-       IsPPC64 ? kPPC64_ShadowOffset64 : kDefaultShadowOffset64);
-  if (!ZeroBaseShadow && ClShort64BitOffset && IsX86_64 && !IsMacOSX) {
-    assert(LongSize == 64);
-    Mapping.Offset = kDefaultShort64bitShadowOffset;
-  }
-  if (!ZeroBaseShadow && ClMappingOffsetLog >= 0) {
-    // Zero offset log is the special case.
-    Mapping.Offset = (ClMappingOffsetLog == 0) ? 0 : 1ULL << ClMappingOffsetLog;
+  if (LongSize == 32) {
+    if (IsAndroid)
+      Mapping.Offset = 0;
+    else if (IsMIPS32)
+      Mapping.Offset = kMIPS32_ShadowOffset32;
+    else if (IsFreeBSD)
+      Mapping.Offset = kFreeBSD_ShadowOffset32;
+    else if (IsIOS)
+      Mapping.Offset = kIOSShadowOffset32;
+    else
+      Mapping.Offset = kDefaultShadowOffset32;
+  } else {  // LongSize == 64
+    if (IsPPC64)
+      Mapping.Offset = kPPC64_ShadowOffset64;
+    else if (IsFreeBSD)
+      Mapping.Offset = kFreeBSD_ShadowOffset64;
+    else if (IsLinux && IsX86_64)
+      Mapping.Offset = kSmallX86_64ShadowOffset;
+    else
+      Mapping.Offset = kDefaultShadowOffset64;
   }
 
   Mapping.Scale = kDefaultShadowScale;
@@ -274,6 +343,11 @@ static ShadowMapping getShadowMapping(const Module &M, int LongSize,
     Mapping.Scale = ClMappingScale;
  }
 
+  // OR-ing the shadow offset is more efficient (at least on x86) if the offset
+  // is a power of two, but on ppc64 we have to use add since the shadow
+  // offset is not necessarily 1/8-th of the address space.
+  Mapping.OrShadowOffset = !IsPPC64 && !(Mapping.Offset & (Mapping.Offset - 1));
+
   return Mapping;
 }
 
@@ -285,58 +359,37 @@ static size_t RedzoneSizeForScale(int MappingScale) {
 
 /// AddressSanitizer: instrument the code in module to find memory bugs.
 struct AddressSanitizer : public FunctionPass {
-  AddressSanitizer(bool CheckInitOrder = true,
-                   bool CheckUseAfterReturn = false,
-                   bool CheckLifetime = false,
-                   StringRef BlacklistFile = StringRef(),
-                   bool ZeroBaseShadow = false)
-      : FunctionPass(ID),
-        CheckInitOrder(CheckInitOrder || ClInitializers),
-        CheckUseAfterReturn(CheckUseAfterReturn || ClUseAfterReturn),
-        CheckLifetime(CheckLifetime || ClCheckLifetime),
-        BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile
-                                            : BlacklistFile),
-        ZeroBaseShadow(ZeroBaseShadow) {}
-  virtual const char *getPassName() const {
+  AddressSanitizer() : FunctionPass(ID) {}
+  const char *getPassName() const override {
     return "AddressSanitizerFunctionPass";
   }
-  void instrumentMop(Instruction *I);
+  void instrumentMop(Instruction *I, bool UseCalls);
+  void instrumentPointerComparisonOrSubtraction(Instruction *I);
   void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore,
                          Value *Addr, uint32_t TypeSize, bool IsWrite,
-                         Value *SizeArgument);
+                         Value *SizeArgument, bool UseCalls);
   Value *createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
                            Value *ShadowValue, uint32_t TypeSize);
   Instruction *generateCrashCode(Instruction *InsertBefore, Value *Addr,
                                  bool IsWrite, size_t AccessSizeIndex,
                                  Value *SizeArgument);
-  bool instrumentMemIntrinsic(MemIntrinsic *MI);
-  void instrumentMemIntrinsicParam(Instruction *OrigIns, Value *Addr,
-                                   Value *Size,
-                                   Instruction *InsertBefore, bool IsWrite);
+  void instrumentMemIntrinsic(MemIntrinsic *MI);
   Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
-  bool runOnFunction(Function &F);
+  bool runOnFunction(Function &F) override;
   bool maybeInsertAsanInitAtFunctionEntry(Function &F);
-  void emitShadowMapping(Module &M, IRBuilder<> &IRB) const;
-  virtual bool doInitialization(Module &M);
+  bool doInitialization(Module &M) override;
   static char ID;  // Pass identification, replacement for typeid
 
 private:
   void initializeCallbacks(Module &M);
 
-  bool ShouldInstrumentGlobal(GlobalVariable *G);
   bool LooksLikeCodeInBug11395(Instruction *I);
-  void FindDynamicInitializers(Module &M);
   bool GlobalIsLinkerInitialized(GlobalVariable *G);
-  bool InjectCoverage(Function &F);
-
-  bool CheckInitOrder;
-  bool CheckUseAfterReturn;
-  bool CheckLifetime;
-  SmallString<64> BlacklistFile;
-  bool ZeroBaseShadow;
+  bool InjectCoverage(Function &F, const ArrayRef<BasicBlock*> AllBlocks);
+  void InjectCoverageAtBlock(Function &F, BasicBlock &BB);
 
   LLVMContext *C;
-  DataLayout *TD;
+  const DataLayout *DL;
   int LongSize;
   Type *IntptrTy;
   ShadowMapping Mapping;
@@ -344,56 +397,50 @@ struct AddressSanitizer : public FunctionPass {
   Function *AsanInitFunction;
   Function *AsanHandleNoReturnFunc;
   Function *AsanCovFunction;
-  OwningPtr<SpecialCaseList> BL;
+  Function *AsanPtrCmpFunction, *AsanPtrSubFunction;
   // This array is indexed by AccessIsWrite and log2(AccessSize).
   Function *AsanErrorCallback[2][kNumberOfAccessSizes];
+  Function *AsanMemoryAccessCallback[2][kNumberOfAccessSizes];
   // This array is indexed by AccessIsWrite.
-  Function *AsanErrorCallbackSized[2];
+  Function *AsanErrorCallbackSized[2],
+           *AsanMemoryAccessCallbackSized[2];
+  Function *AsanMemmove, *AsanMemcpy, *AsanMemset;
   InlineAsm *EmptyAsm;
-  SetOfDynamicallyInitializedGlobals DynamicallyInitializedGlobals;
+  GlobalsMetadata GlobalsMD;
 
   friend struct FunctionStackPoisoner;
 };
 
 class AddressSanitizerModule : public ModulePass {
 public:
-  AddressSanitizerModule(bool CheckInitOrder = true,
-                         StringRef BlacklistFile = StringRef(),
-                         bool ZeroBaseShadow = false)
-      : ModulePass(ID),
-        CheckInitOrder(CheckInitOrder || ClInitializers),
-        BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile
-                                            : BlacklistFile),
-        ZeroBaseShadow(ZeroBaseShadow) {}
-  bool runOnModule(Module &M);
+  AddressSanitizerModule() : ModulePass(ID) {}
+  bool runOnModule(Module &M) override;
   static char ID;  // Pass identification, replacement for typeid
-  virtual const char *getPassName() const {
+  const char *getPassName() const override {
     return "AddressSanitizerModule";
   }
 
 private:
   void initializeCallbacks(Module &M);
 
+  bool InstrumentGlobals(IRBuilder<> &IRB, Module &M);
   bool ShouldInstrumentGlobal(GlobalVariable *G);
+  void poisonOneInitializer(Function &GlobalInit, GlobalValue *ModuleName);
   void createInitializerPoisonCalls(Module &M, GlobalValue *ModuleName);
-  size_t RedzoneSize() const {
+  size_t MinRedzoneSizeForGlobal() const {
     return RedzoneSizeForScale(Mapping.Scale);
   }
 
-  bool CheckInitOrder;
-  SmallString<64> BlacklistFile;
-  bool ZeroBaseShadow;
-
-  OwningPtr<SpecialCaseList> BL;
-  SetOfDynamicallyInitializedGlobals DynamicallyInitializedGlobals;
+  GlobalsMetadata GlobalsMD;
   Type *IntptrTy;
   LLVMContext *C;
-  DataLayout *TD;
+  const DataLayout *DL;
   ShadowMapping Mapping;
   Function *AsanPoisonGlobals;
   Function *AsanUnpoisonGlobals;
   Function *AsanRegisterGlobals;
   Function *AsanUnregisterGlobals;
+  Function *AsanCovModuleInit;
 };
 
 // Stack poisoning does not play well with exception handling.
@@ -416,7 +463,6 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
 
   SmallVector<AllocaInst*, 16> AllocaVec;
   SmallVector<Instruction*, 8> RetVec;
-  uint64_t TotalStackSize;
   unsigned StackAlignment;
 
   Function *AsanStackMallocFunc[kMaxAsanStackMallocSizeClass + 1],
@@ -440,16 +486,14 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
       : F(F), ASan(ASan), DIB(*F.getParent()), C(ASan.C),
        IntptrTy(ASan.IntptrTy), IntptrPtrTy(PointerType::get(IntptrTy, 0)),
        Mapping(ASan.Mapping),
-        TotalStackSize(0), StackAlignment(1 << Mapping.Scale) {}
+        StackAlignment(1 << Mapping.Scale) {}
 
   bool runOnFunction() {
     if (!ClStack) return false;
     // Collect alloca, ret, lifetime instructions etc.
-    for (df_iterator<BasicBlock*> DI = df_begin(&F.getEntryBlock()),
-         DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) {
-      BasicBlock *BB = *DI;
+    for (BasicBlock *BB : depth_first(&F.getEntryBlock()))
       visit(*BB);
-    }
+
     if (AllocaVec.empty()) return false;
 
     initializeCallbacks(*F.getParent());
@@ -479,14 +523,12 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
 
     StackAlignment = std::max(StackAlignment, AI.getAlignment());
     AllocaVec.push_back(&AI);
-    uint64_t AlignedSize = getAlignedAllocaSize(&AI);
-    TotalStackSize += AlignedSize;
   }
 
   /// \brief Collect lifetime intrinsic calls to check for use-after-scope
   /// errors.
   void visitIntrinsicInst(IntrinsicInst &II) {
-    if (!ASan.CheckLifetime) return;
+    if (!ClCheckLifetime) return;
     Intrinsic::ID ID = II.getIntrinsicID();
     if (ID != Intrinsic::lifetime_start && ID != Intrinsic::lifetime_end)
@@ -514,31 +556,20 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
 
   // Check if we want (and can) handle this alloca.
   bool isInterestingAlloca(AllocaInst &AI) const {
-    return (!AI.isArrayAllocation() &&
-            AI.isStaticAlloca() &&
-            AI.getAlignment() <= RedzoneSize() &&
-            AI.getAllocatedType()->isSized());
+    return (!AI.isArrayAllocation() && AI.isStaticAlloca() &&
+            AI.getAllocatedType()->isSized() &&
+            // alloca() may be called with 0 size, ignore it.
+            getAllocaSizeInBytes(&AI) > 0);
   }
 
-  size_t RedzoneSize() const {
-    return RedzoneSizeForScale(Mapping.Scale);
-  }
   uint64_t getAllocaSizeInBytes(AllocaInst *AI) const {
     Type *Ty = AI->getAllocatedType();
-    uint64_t SizeInBytes = ASan.TD->getTypeAllocSize(Ty);
+    uint64_t SizeInBytes = ASan.DL->getTypeAllocSize(Ty);
     return SizeInBytes;
   }
-  uint64_t getAlignedSize(uint64_t SizeInBytes) const {
-    size_t RZ = RedzoneSize();
-    return ((SizeInBytes + RZ - 1) / RZ) * RZ;
-  }
-  uint64_t getAlignedAllocaSize(AllocaInst *AI) const {
-    uint64_t SizeInBytes = getAllocaSizeInBytes(AI);
-    return getAlignedSize(SizeInBytes);
-  }
   /// Finds alloca where the value comes from.
   AllocaInst *findAllocaForValue(Value *V);
-  void poisonRedZones(const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> &IRB,
+  void poisonRedZones(const ArrayRef<uint8_t> ShadowBytes, IRBuilder<> &IRB,
                       Value *ShadowBase, bool DoPoison);
   void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> &IRB, bool DoPoison);
 
@@ -552,21 +583,16 @@ char AddressSanitizer::ID = 0;
 INITIALIZE_PASS(AddressSanitizer, "asan",
     "AddressSanitizer: detects use-after-free and out-of-bounds bugs.",
     false, false)
-FunctionPass *llvm::createAddressSanitizerFunctionPass(
-    bool CheckInitOrder, bool CheckUseAfterReturn, bool CheckLifetime,
-    StringRef BlacklistFile, bool ZeroBaseShadow) {
-  return new AddressSanitizer(CheckInitOrder, CheckUseAfterReturn,
-                              CheckLifetime, BlacklistFile, ZeroBaseShadow);
+FunctionPass *llvm::createAddressSanitizerFunctionPass() {
+  return new AddressSanitizer();
 }
 
 char AddressSanitizerModule::ID = 0;
 INITIALIZE_PASS(AddressSanitizerModule, "asan-module",
     "AddressSanitizer: detects use-after-free and out-of-bounds bugs."
    "ModulePass", false, false)
-ModulePass *llvm::createAddressSanitizerModulePass(
-    bool CheckInitOrder, StringRef BlacklistFile, bool ZeroBaseShadow) {
-  return new AddressSanitizerModule(CheckInitOrder, BlacklistFile,
-                                    ZeroBaseShadow);
+ModulePass *llvm::createAddressSanitizerModulePass() {
+  return new AddressSanitizerModule();
 }
 
 static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
@@ -576,12 +602,16 @@ static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
 }
 
 // \brief Create a constant for Str so that we can pass it to the run-time lib.
-static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str) {
+static GlobalVariable *createPrivateGlobalForString(
+    Module &M, StringRef Str, bool AllowMerging) {
   Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
-  GlobalVariable *GV = new GlobalVariable(M, StrConst->getType(), true,
-                                          GlobalValue::InternalLinkage, StrConst,
-                                          kAsanGenPrefix);
-  GV->setUnnamedAddr(true);  // Ok to merge these.
+  // We use private linkage for module-local strings. If they can be merged
+  // with another one, we set the unnamed_addr attribute.
+  GlobalVariable *GV =
+      new GlobalVariable(M, StrConst->getType(), true,
+                         GlobalValue::PrivateLinkage, StrConst, kAsanGenPrefix);
+  if (AllowMerging)
+    GV->setUnnamedAddr(true);
   GV->setAlignment(1);  // Strings may not be merged w/o setting align 1.
   return GV;
 }
 
@@ -602,90 +632,111 @@ Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) {
   return IRB.CreateAdd(Shadow, ConstantInt::get(IntptrTy, Mapping.Offset));
 }
 
-void AddressSanitizer::instrumentMemIntrinsicParam(
-    Instruction *OrigIns,
-    Value *Addr, Value *Size, Instruction *InsertBefore, bool IsWrite) {
-  IRBuilder<> IRB(InsertBefore);
-  if (Size->getType() != IntptrTy)
-    Size = IRB.CreateIntCast(Size, IntptrTy, false);
-  // Check the first byte.
-  instrumentAddress(OrigIns, InsertBefore, Addr, 8, IsWrite, Size);
-  // Check the last byte.
-  IRB.SetInsertPoint(InsertBefore);
-  Value *SizeMinusOne = IRB.CreateSub(Size, ConstantInt::get(IntptrTy, 1));
-  Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
-  Value *AddrLast = IRB.CreateAdd(AddrLong, SizeMinusOne);
-  instrumentAddress(OrigIns, InsertBefore, AddrLast, 8, IsWrite, Size);
-}
-
 // Instrument memset/memmove/memcpy
-bool AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
-  Value *Dst = MI->getDest();
-  MemTransferInst *MemTran = dyn_cast<MemTransferInst>(MI);
-  Value *Src = MemTran ? MemTran->getSource() : 0;
-  Value *Length = MI->getLength();
-
-  Constant *ConstLength = dyn_cast<Constant>(Length);
-  Instruction *InsertBefore = MI;
-  if (ConstLength) {
-    if (ConstLength->isNullValue()) return false;
-  } else {
-    // The size is not a constant so it could be zero -- check at run-time.
-    IRBuilder<> IRB(InsertBefore);
-
-    Value *Cmp = IRB.CreateICmpNE(Length,
-                                  Constant::getNullValue(Length->getType()));
-    InsertBefore = SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false);
+void AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
+  IRBuilder<> IRB(MI);
+  if (isa<MemTransferInst>(MI)) {
+    IRB.CreateCall3(
+        isa<MemMoveInst>(MI) ? AsanMemmove : AsanMemcpy,
+        IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
+        IRB.CreatePointerCast(MI->getOperand(1), IRB.getInt8PtrTy()),
+        IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false));
+  } else if (isa<MemSetInst>(MI)) {
+    IRB.CreateCall3(
+        AsanMemset,
+        IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
+        IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false),
+        IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false));
   }
-
-  instrumentMemIntrinsicParam(MI, Dst, Length, InsertBefore, true);
-  if (Src)
-    instrumentMemIntrinsicParam(MI, Src, Length, InsertBefore, false);
-  return true;
+  MI->eraseFromParent();
 }
 
 // If I is an interesting memory access, return the PointerOperand
-// and set IsWrite. Otherwise return NULL.
-static Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite) {
+// and set IsWrite/Alignment. Otherwise return NULL.
+static Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite,
+                                        unsigned *Alignment) {
+  // Skip memory accesses inserted by another instrumentation.
+  if (I->getMetadata("nosanitize"))
+    return nullptr;
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
-    if (!ClInstrumentReads) return NULL;
+    if (!ClInstrumentReads) return nullptr;
     *IsWrite = false;
+    *Alignment = LI->getAlignment();
     return LI->getPointerOperand();
   }
   if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
-    if (!ClInstrumentWrites) return NULL;
+    if (!ClInstrumentWrites) return nullptr;
     *IsWrite = true;
+    *Alignment = SI->getAlignment();
     return SI->getPointerOperand();
   }
   if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
-    if (!ClInstrumentAtomics) return NULL;
+    if (!ClInstrumentAtomics) return nullptr;
     *IsWrite = true;
+    *Alignment = 0;
     return RMW->getPointerOperand();
   }
   if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
-    if (!ClInstrumentAtomics) return NULL;
+    if (!ClInstrumentAtomics) return nullptr;
     *IsWrite = true;
+    *Alignment = 0;
     return XCHG->getPointerOperand();
   }
-  return NULL;
+  return nullptr;
+}
+
+static bool isPointerOperand(Value *V) {
+  return V->getType()->isPointerTy() || isa<PtrToIntInst>(V);
+}
+
+// This is a rough heuristic; it may cause both false positives and
+// false negatives. The proper implementation requires cooperation with
+// the frontend.
+static bool isInterestingPointerComparisonOrSubtraction(Instruction *I) {
+  if (ICmpInst *Cmp = dyn_cast<ICmpInst>(I)) {
+    if (!Cmp->isRelational())
+      return false;
+  } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+    if (BO->getOpcode() != Instruction::Sub)
+      return false;
+  } else {
+    return false;
+  }
+  if (!isPointerOperand(I->getOperand(0)) ||
+      !isPointerOperand(I->getOperand(1)))
+    return false;
+  return true;
 }
 
 bool AddressSanitizer::GlobalIsLinkerInitialized(GlobalVariable *G) {
   // If a global variable does not have dynamic initialization we don't
   // have to instrument it. However, if a global does not have initializer
   // at all, we assume it has dynamic initializer (in other TU).
-  return G->hasInitializer() && !DynamicallyInitializedGlobals.Contains(G);
+  return G->hasInitializer() && !GlobalsMD.get(G).IsDynInit;
 }
 
-void AddressSanitizer::instrumentMop(Instruction *I) {
+void
+AddressSanitizer::instrumentPointerComparisonOrSubtraction(Instruction *I) {
+  IRBuilder<> IRB(I);
+  Function *F = isa<ICmpInst>(I) ? AsanPtrCmpFunction : AsanPtrSubFunction;
+  Value *Param[2] = {I->getOperand(0), I->getOperand(1)};
+  for (int i = 0; i < 2; i++) {
+    if (Param[i]->getType()->isPointerTy())
+      Param[i] = IRB.CreatePointerCast(Param[i], IntptrTy);
+  }
+  IRB.CreateCall2(F, Param[0], Param[1]);
+}
+
+void AddressSanitizer::instrumentMop(Instruction *I, bool UseCalls) {
   bool IsWrite = false;
-  Value *Addr = isInterestingMemoryAccess(I, &IsWrite);
+  unsigned Alignment = 0;
+  Value *Addr = isInterestingMemoryAccess(I, &IsWrite, &Alignment);
   assert(Addr);
   if (ClOpt && ClOptGlobals) {
     if (GlobalVariable *G = dyn_cast<GlobalVariable>(Addr)) {
       // If initialization order checking is disabled, a simple access to a
       // dynamically initialized global is always valid.
-      if (!CheckInitOrder || GlobalIsLinkerInitialized(G)) {
+      if (!ClInitializers || GlobalIsLinkerInitialized(G)) {
         NumOptimizedAccessesToGlobalVar++;
         return;
       }
@@ -705,7 +756,7 @@ void AddressSanitizer::instrumentMop(Instruction *I) {
   Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType();
 
   assert(OrigTy->isSized());
-  uint32_t TypeSize = TD->getTypeStoreSizeInBits(OrigTy);
+  uint32_t TypeSize = DL->getTypeStoreSizeInBits(OrigTy);
 
   assert((TypeSize % 8) == 0);
 
@@ -714,22 +765,29 @@ void AddressSanitizer::instrumentMop(Instruction *I) {
   else
     NumInstrumentedReads++;
 
-  // Instrument a 1-, 2-, 4-, 8-, or 16- byte access with one check.
-  if (TypeSize == 8  || TypeSize == 16 ||
-      TypeSize == 32 || TypeSize == 64 || TypeSize == 128)
-    return instrumentAddress(I, I, Addr, TypeSize, IsWrite, 0);
-  // Instrument unusual size (but still multiple of 8).
+  unsigned Granularity = 1 << Mapping.Scale;
+  // Instrument a 1-, 2-, 4-, 8-, or 16- byte access with one check
+  // if the data is properly aligned.
+  if ((TypeSize == 8 || TypeSize == 16 || TypeSize == 32 || TypeSize == 64 ||
+       TypeSize == 128) &&
+      (Alignment >= Granularity || Alignment == 0 || Alignment >= TypeSize / 8))
+    return instrumentAddress(I, I, Addr, TypeSize, IsWrite, nullptr, UseCalls);
+  // Instrument unusual size or unusual alignment.
   // We can not do it with a single check, so we do 1-byte check for the first
   // and the last bytes. We call __asan_report_*_n(addr, real_size) to be able
   // to report the actual access size.
   IRBuilder<> IRB(I);
-  Value *LastByte = IRB.CreateIntToPtr(
-      IRB.CreateAdd(IRB.CreatePointerCast(Addr, IntptrTy),
-                    ConstantInt::get(IntptrTy, TypeSize / 8 - 1)),
-      OrigPtrTy);
   Value *Size = ConstantInt::get(IntptrTy, TypeSize / 8);
-  instrumentAddress(I, I, Addr, 8, IsWrite, Size);
-  instrumentAddress(I, I, LastByte, 8, IsWrite, Size);
+  Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
+  if (UseCalls) {
+    IRB.CreateCall2(AsanMemoryAccessCallbackSized[IsWrite], AddrLong, Size);
+  } else {
+    Value *LastByte = IRB.CreateIntToPtr(
+        IRB.CreateAdd(AddrLong, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)),
+        OrigPtrTy);
+    instrumentAddress(I, I, Addr, 8, IsWrite, Size, false);
+    instrumentAddress(I, I, LastByte, 8, IsWrite, Size, false);
+  }
 }
 
 // Validate the result of Module::getOrInsertFunction called for an interface
@@ -777,11 +835,18 @@ Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
 }
 
 void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
-                                         Instruction *InsertBefore,
-                                         Value *Addr, uint32_t TypeSize,
-                                         bool IsWrite, Value *SizeArgument) {
+                                         Instruction *InsertBefore, Value *Addr,
+                                         uint32_t TypeSize, bool IsWrite,
+                                         Value *SizeArgument, bool UseCalls) {
   IRBuilder<> IRB(InsertBefore);
   Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
+  size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize);
+
+  if (UseCalls) {
+    IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][AccessSizeIndex],
+                   AddrLong);
+    return;
+  }
 
   Type *ShadowTy = IntegerType::get(
       *C, std::max(8U, TypeSize >> Mapping.Scale));
@@ -792,13 +857,12 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
       IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy));
   Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal);
 
-  size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize);
   size_t Granularity = 1 << Mapping.Scale;
-  TerminatorInst *CrashTerm = 0;
+  TerminatorInst *CrashTerm = nullptr;
 
   if (ClAlwaysSlowPath || (TypeSize < 8 * Granularity)) {
     TerminatorInst *CheckTerm =
-        SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false);
+        SplitBlockAndInsertIfThen(Cmp, InsertBefore, false);
     assert(dyn_cast<BranchInst>(CheckTerm)->isUnconditional());
     BasicBlock *NextBB = CheckTerm->getSuccessor(0);
     IRB.SetInsertPoint(CheckTerm);
@@ -809,7 +873,7 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
     BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2);
     ReplaceInstWithInst(CheckTerm, NewTerm);
   } else {
-    CrashTerm = SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), true);
+    CrashTerm = SplitBlockAndInsertIfThen(Cmp, InsertBefore, true);
   }
 
   Instruction *Crash = generateCrashCode(
@@ -817,27 +881,36 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
   Crash->setDebugLoc(OrigIns->getDebugLoc());
 }
 
-void AddressSanitizerModule::createInitializerPoisonCalls(
-    Module &M, GlobalValue *ModuleName) {
-  // We do all of our poisoning and unpoisoning within _GLOBAL__I_a.
-  Function *GlobalInit = M.getFunction("_GLOBAL__I_a");
-  // If that function is not present, this TU contains no globals, or they have
-  // all been optimized away
-  if (!GlobalInit)
-    return;
-
+void AddressSanitizerModule::poisonOneInitializer(Function &GlobalInit,
+                                                  GlobalValue *ModuleName) {
   // Set up the arguments to our poison/unpoison functions.
-  IRBuilder<> IRB(GlobalInit->begin()->getFirstInsertionPt());
+  IRBuilder<> IRB(GlobalInit.begin()->getFirstInsertionPt());
 
   // Add a call to poison all external globals before the given function starts.
   Value *ModuleNameAddr = ConstantExpr::getPointerCast(ModuleName, IntptrTy);
   IRB.CreateCall(AsanPoisonGlobals, ModuleNameAddr);
 
   // Add calls to unpoison all globals before each return instruction.
-  for (Function::iterator I = GlobalInit->begin(), E = GlobalInit->end();
-       I != E; ++I) {
-    if (ReturnInst *RI = dyn_cast<ReturnInst>(I->getTerminator())) {
+  for (auto &BB : GlobalInit.getBasicBlockList())
+    if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
       CallInst::Create(AsanUnpoisonGlobals, "", RI);
+}
+
+void AddressSanitizerModule::createInitializerPoisonCalls(
+    Module &M, GlobalValue *ModuleName) {
+  GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
+
+  ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
+  for (Use &OP : CA->operands()) {
+    if (isa<ConstantAggregateZero>(OP))
+      continue;
+    ConstantStruct *CS = cast<ConstantStruct>(OP);
+
+    // Must have a function or null ptr.
+    // (CS->getOperand(0) is the init priority.)
+    if (Function* F = dyn_cast<Function>(CS->getOperand(1))) {
+      if (F->getName() != kAsanModuleCtorName)
+        poisonOneInitializer(*F, ModuleName);
     }
   }
 }
@@ -846,23 +919,27 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
   Type *Ty = cast<PointerType>(G->getType())->getElementType();
   DEBUG(dbgs() << "GLOBAL: " << *G << "\n");
 
-  if (BL->isIn(*G)) return false;
+  if (GlobalsMD.get(G).IsBlacklisted) return false;
+  if (GlobalsMD.isInstrumentationGlobal(G)) return false;
   if (!Ty->isSized()) return false;
   if (!G->hasInitializer()) return false;
   if (GlobalWasGeneratedByAsan(G)) return false;  // Our own global.
   // Touch only those globals that will not be defined in other modules.
-  // Don't handle ODR type linkages since other modules may be built w/o asan.
+  // Don't handle ODR linkage types and COMDATs since other modules may be built
+  // without ASan.
   if (G->getLinkage() != GlobalVariable::ExternalLinkage &&
       G->getLinkage() != GlobalVariable::PrivateLinkage &&
       G->getLinkage() != GlobalVariable::InternalLinkage)
     return false;
+  if (G->hasComdat())
+    return false;
   // Two problems with thread-locals:
   //   - The address of the main thread's copy can't be computed at link-time.
   //   - Need to poison all copies, not just the main thread's one.
   if (G->isThreadLocal())
     return false;
-  // For now, just ignore this Alloca if the alignment is large.
-  if (G->getAlignment() > RedzoneSize()) return false;
+  // For now, just ignore this Global if the alignment is large.
+  if (G->getAlignment() > MinRedzoneSizeForGlobal()) return false;
 
   // Ignore all the globals with the names starting with "\01L_OBJC_".
   // Many of those are put into the .cstring section. The linker compresses
@@ -870,7 +947,7 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
   // our redzones get broken.
   if ((G->getName().find("\01L_OBJC_") == 0) ||
       (G->getName().find("\01l_OBJC_") == 0)) {
-    DEBUG(dbgs() << "Ignoring \\01L_OBJC_* global: " << *G);
+    DEBUG(dbgs() << "Ignoring \\01L_OBJC_* global: " << *G << "\n");
     return false;
   }
 
@@ -879,9 +956,9 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
     // Ignore the globals from the __OBJC section. The ObjC runtime assumes
     // those conform to /usr/lib/objc/runtime.h, so we can't add redzones to
    // them.
-    if ((Section.find("__OBJC,") == 0) ||
-        (Section.find("__DATA, __objc_") == 0)) {
-      DEBUG(dbgs() << "Ignoring ObjC runtime global: " << *G);
+    if (Section.startswith("__OBJC,") ||
+        Section.startswith("__DATA, __objc_")) {
+      DEBUG(dbgs() << "Ignoring ObjC runtime global: " << *G << "\n");
       return false;
     }
     // See http://code.google.com/p/address-sanitizer/issues/detail?id=32
@@ -892,10 +969,28 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
     // is placed into __DATA,__cfstring
     // Therefore there's no point in placing redzones into __DATA,__cfstring.
     // Moreover, it causes the linker to crash on OS X 10.7
-    if (Section.find("__DATA,__cfstring") == 0) {
-      DEBUG(dbgs() << "Ignoring CFString: " << *G);
+    if (Section.startswith("__DATA,__cfstring")) {
+      DEBUG(dbgs() << "Ignoring CFString: " << *G << "\n");
+      return false;
+    }
+    // The linker merges the contents of cstring_literals and removes the
+    // trailing zeroes.
+    if (Section.startswith("__TEXT,__cstring,cstring_literals")) {
+      DEBUG(dbgs() << "Ignoring a cstring literal: " << *G << "\n");
+      return false;
+    }
+
+    // Callbacks put into the CRT initializer/terminator sections
+    // should not be instrumented.
+    // See https://code.google.com/p/address-sanitizer/issues/detail?id=305
+    // and http://msdn.microsoft.com/en-US/en-en/library/bb918180(v=vs.120).aspx
+    if (Section.startswith(".CRT")) {
+      DEBUG(dbgs() << "Ignoring a global initializer callback: " << *G << "\n");
       return false;
     }
+
+    // Globals from llvm.metadata aren't emitted, do not instrument them.
+    if (Section == "llvm.metadata") return false;
   }
 
   return true;
@@ -919,31 +1014,23 @@ void AddressSanitizerModule::initializeCallbacks(Module &M) {
       kAsanUnregisterGlobalsName,
      IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
   AsanUnregisterGlobals->setLinkage(Function::ExternalLinkage);
+  AsanCovModuleInit = checkInterfaceFunction(M.getOrInsertFunction(
+      kAsanCovModuleInitName,
+      IRB.getVoidTy(), IntptrTy, NULL));
+  AsanCovModuleInit->setLinkage(Function::ExternalLinkage);
 }
 
 // This function replaces all global variables with new variables that have
 // trailing redzones. It also creates a function that poisons
 // redzones and inserts this function into llvm.global_ctors.
-bool AddressSanitizerModule::runOnModule(Module &M) {
-  if (!ClGlobals) return false;
-  TD = getAnalysisIfAvailable<DataLayout>();
-  if (!TD)
-    return false;
-  BL.reset(SpecialCaseList::createOrDie(BlacklistFile));
-  if (BL->isIn(M)) return false;
-  C = &(M.getContext());
-  int LongSize = TD->getPointerSizeInBits();
-  IntptrTy = Type::getIntNTy(*C, LongSize);
-  Mapping = getShadowMapping(M, LongSize, ZeroBaseShadow);
-  initializeCallbacks(M);
-  DynamicallyInitializedGlobals.Init(M);
+bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
+  GlobalsMD.init(M);
 
   SmallVector<GlobalVariable *, 16> GlobalsToChange;
 
-  for (Module::GlobalListType::iterator G = M.global_begin(),
-       E = M.global_end(); G != E; ++G) {
-    if (ShouldInstrumentGlobal(G))
-      GlobalsToChange.push_back(G);
+  for (auto &G : M.globals()) {
+    if (ShouldInstrumentGlobal(&G))
+      GlobalsToChange.push_back(&G);
   }
 
   size_t n = GlobalsToChange.size();
@@ -956,31 +1043,35 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
   //   const char *name;
   //   const char *module_name;
   //   size_t has_dynamic_init;
+  //   void *source_location;
   // We initialize an array of such structures and pass it to a run-time call.
-  StructType *GlobalStructTy = StructType::get(IntptrTy, IntptrTy,
-                                               IntptrTy, IntptrTy,
-                                               IntptrTy, IntptrTy, NULL);
+  StructType *GlobalStructTy =
+      StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy,
+                      IntptrTy, IntptrTy, NULL);
   SmallVector<Constant *, 16> Initializers(n);
 
-  Function *CtorFunc = M.getFunction(kAsanModuleCtorName);
-  assert(CtorFunc);
-  IRBuilder<> IRB(CtorFunc->getEntryBlock().getTerminator());
-
   bool HasDynamicallyInitializedGlobals = false;
 
-  GlobalVariable *ModuleName = createPrivateGlobalForString(
-      M, M.getModuleIdentifier());
   // We shouldn't merge same module names, as this string serves as unique
   // module ID in runtime.
-  ModuleName->setUnnamedAddr(false);
+  GlobalVariable *ModuleName = createPrivateGlobalForString(
+      M, M.getModuleIdentifier(), /*AllowMerging*/false);
 
   for (size_t i = 0; i < n; i++) {
     static const uint64_t kMaxGlobalRedzone = 1 << 18;
     GlobalVariable *G = GlobalsToChange[i];
+
+    auto MD = GlobalsMD.get(G);
+    // Create string holding the global name unless it was provided by
+    // the metadata.
+    GlobalVariable *Name =
+        MD.Name ? MD.Name : createPrivateGlobalForString(M, G->getName(),
+                                                         /*AllowMerging*/ true);
+
     PointerType *PtrTy = cast<PointerType>(G->getType());
     Type *Ty = PtrTy->getElementType();
-    uint64_t SizeInBytes = TD->getTypeAllocSize(Ty);
+    uint64_t SizeInBytes = DL->getTypeAllocSize(Ty);
     uint64_t MinRZ = MinRedzoneSizeForGlobal();
     // MinRZ <= RZ <= kMaxGlobalRedzone
    // and trying to make RZ to be ~ 1/4 of SizeInBytes.
    uint64_t RZ = std::max(MinRZ,
@@ -992,19 +1083,12 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
       RightRedzoneSize += MinRZ - (SizeInBytes % MinRZ);
     assert(((RightRedzoneSize + SizeInBytes) % MinRZ) == 0);
     Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize);
-    // Determine whether this global should be poisoned in initialization.
-    bool GlobalHasDynamicInitializer =
-        DynamicallyInitializedGlobals.Contains(G);
-    // Don't check initialization order if this global is blacklisted.
-    GlobalHasDynamicInitializer &= !BL->isIn(*G, "init");
 
     StructType *NewTy = StructType::get(Ty, RightRedZoneTy, NULL);
     Constant *NewInitializer = ConstantStruct::get(
         NewTy, G->getInitializer(),
         Constant::getNullValue(RightRedZoneTy), NULL);
 
-    GlobalVariable *Name = createPrivateGlobalForString(M, G->getName());
-
     // Create a new global variable with enough space for a redzone.
     GlobalValue::LinkageTypes Linkage = G->getLinkage();
     if (G->isConstant() && Linkage == GlobalValue::PrivateLinkage)
@@ -1025,17 +1109,17 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
     G->eraseFromParent();
 
     Initializers[i] = ConstantStruct::get(
-        GlobalStructTy,
-        ConstantExpr::getPointerCast(NewGlobal, IntptrTy),
+        GlobalStructTy, ConstantExpr::getPointerCast(NewGlobal, IntptrTy),
         ConstantInt::get(IntptrTy, SizeInBytes),
         ConstantInt::get(IntptrTy, SizeInBytes + RightRedzoneSize),
         ConstantExpr::getPointerCast(Name, IntptrTy),
         ConstantExpr::getPointerCast(ModuleName, IntptrTy),
-        ConstantInt::get(IntptrTy, GlobalHasDynamicInitializer),
+        ConstantInt::get(IntptrTy, MD.IsDynInit),
+        MD.SourceLoc ? ConstantExpr::getPointerCast(MD.SourceLoc, IntptrTy)
+                     : ConstantInt::get(IntptrTy, 0),
         NULL);
 
-    // Populate the first and last globals declared in this TU.
-    if (CheckInitOrder && GlobalHasDynamicInitializer)
+    if (ClInitializers && MD.IsDynInit)
       HasDynamicallyInitializedGlobals = true;
 
     DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n");
@@ -1047,7 +1131,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
       ConstantArray::get(ArrayOfGlobalStructTy, Initializers), "");
 
   // Create calls for poisoning before initializers run and unpoisoning after.
-  if (CheckInitOrder && HasDynamicallyInitializedGlobals)
+  if (HasDynamicallyInitializedGlobals)
     createInitializerPoisonCalls(M, ModuleName);
   IRB.CreateCall2(AsanRegisterGlobals,
                   IRB.CreatePointerCast(AllGlobals, IntptrTy),
@@ -1063,12 +1147,42 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
   IRB_Dtor.CreateCall2(AsanUnregisterGlobals,
                        IRB.CreatePointerCast(AllGlobals, IntptrTy),
                        ConstantInt::get(IntptrTy, n));
-  appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndCtorPriority);
+  appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndDtorPriority);
 
   DEBUG(dbgs() << M);
   return true;
 }
 
+bool AddressSanitizerModule::runOnModule(Module &M) {
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  if (!DLP)
+    return false;
+  DL = &DLP->getDataLayout();
+  C = &(M.getContext());
+  int LongSize = DL->getPointerSizeInBits();
+  IntptrTy = Type::getIntNTy(*C, LongSize);
+  Mapping = getShadowMapping(M, LongSize);
+  initializeCallbacks(M);
+
+  bool Changed = false;
+
+  Function *CtorFunc = M.getFunction(kAsanModuleCtorName);
+  assert(CtorFunc);
+  IRBuilder<> IRB(CtorFunc->getEntryBlock().getTerminator());
+
+  if (ClCoverage > 0) {
+    Function *CovFunc = M.getFunction(kAsanCovName);
+    int nCov = CovFunc ? CovFunc->getNumUses() : 0;
+    IRB.CreateCall(AsanCovModuleInit, ConstantInt::get(IntptrTy, nCov));
+    Changed = true;
+  }
+
+  if (ClGlobals)
+    Changed |= InstrumentGlobals(IRB, M);
+
+  return Changed;
+}
+
 void AddressSanitizer::initializeCallbacks(Module &M) {
   IRBuilder<> IRB(*C);
   // Create __asan_report* callbacks.
@@ -1076,12 +1190,16 @@ void AddressSanitizer::initializeCallbacks(Module &M) {
     for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
         AccessSizeIndex++) {
       // IsWrite and TypeSize are encoded in the function name.
-      std::string FunctionName = std::string(kAsanReportErrorTemplate) +
+      std::string Suffix =
          (AccessIsWrite ? "store" : "load") + itostr(1 << AccessSizeIndex);
-      // If we are merging crash callbacks, they have two parameters.
       AsanErrorCallback[AccessIsWrite][AccessSizeIndex] =
-          checkInterfaceFunction(M.getOrInsertFunction(
-              FunctionName, IRB.getVoidTy(), IntptrTy, NULL));
+          checkInterfaceFunction(
+              M.getOrInsertFunction(kAsanReportErrorTemplate + Suffix,
+                                    IRB.getVoidTy(), IntptrTy, NULL));
+      AsanMemoryAccessCallback[AccessIsWrite][AccessSizeIndex] =
+          checkInterfaceFunction(
+              M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + Suffix,
+                                    IRB.getVoidTy(), IntptrTy, NULL));
    }
   }
   AsanErrorCallbackSized[0] = checkInterfaceFunction(M.getOrInsertFunction(
@@ -1089,45 +1207,49 @@ void AddressSanitizer::initializeCallbacks(Module &M) {
   AsanErrorCallbackSized[1] = checkInterfaceFunction(M.getOrInsertFunction(
       kAsanReportStoreN, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
 
-  AsanHandleNoReturnFunc = checkInterfaceFunction(M.getOrInsertFunction(
-      kAsanHandleNoReturnName, IRB.getVoidTy(), NULL));
+  AsanMemoryAccessCallbackSized[0] = checkInterfaceFunction(
+      M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + "loadN",
+                            IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+  AsanMemoryAccessCallbackSized[1] = checkInterfaceFunction(
+      M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + "storeN",
+                            IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+
+  AsanMemmove = checkInterfaceFunction(M.getOrInsertFunction(
+      ClMemoryAccessCallbackPrefix + "memmove", IRB.getInt8PtrTy(),
+      IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy, NULL));
+  AsanMemcpy = checkInterfaceFunction(M.getOrInsertFunction(
+      ClMemoryAccessCallbackPrefix + "memcpy", IRB.getInt8PtrTy(),
+      IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy, NULL));
+  AsanMemset = checkInterfaceFunction(M.getOrInsertFunction(
+      ClMemoryAccessCallbackPrefix + "memset", IRB.getInt8PtrTy(),
+      IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy, NULL));
+
+  AsanHandleNoReturnFunc = checkInterfaceFunction(
+      M.getOrInsertFunction(kAsanHandleNoReturnName, IRB.getVoidTy(), NULL));
   AsanCovFunction = checkInterfaceFunction(M.getOrInsertFunction(
-      kAsanCovName, IRB.getVoidTy(), IntptrTy, NULL));
+      kAsanCovName, IRB.getVoidTy(), NULL));
+  AsanPtrCmpFunction = checkInterfaceFunction(M.getOrInsertFunction(
+      kAsanPtrCmp, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+  AsanPtrSubFunction = checkInterfaceFunction(M.getOrInsertFunction(
+      kAsanPtrSub, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
   // We insert an empty inline asm after __asan_report* to avoid callback merge.
   EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
                             StringRef(""), StringRef(""),
                             /*hasSideEffects=*/true);
 }
 
-void AddressSanitizer::emitShadowMapping(Module &M, IRBuilder<> &IRB) const {
-  // Tell the values of mapping offset and scale to the run-time.
-  GlobalValue *asan_mapping_offset =
-      new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage,
-                     ConstantInt::get(IntptrTy, Mapping.Offset),
-                     kAsanMappingOffsetName);
-  // Read the global, otherwise it may be optimized away.
-  IRB.CreateLoad(asan_mapping_offset, true);
-
-  GlobalValue *asan_mapping_scale =
-      new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage,
-                         ConstantInt::get(IntptrTy, Mapping.Scale),
-                         kAsanMappingScaleName);
-  // Read the global, otherwise it may be optimized away.
-  IRB.CreateLoad(asan_mapping_scale, true);
-}
-
 // virtual
 bool AddressSanitizer::doInitialization(Module &M) {
   // Initialize the private fields. No one has accessed them before.
-  TD = getAnalysisIfAvailable<DataLayout>();
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  if (!DLP)
+    report_fatal_error("data layout missing");
+  DL = &DLP->getDataLayout();
 
-  if (!TD)
-    return false;
-  BL.reset(SpecialCaseList::createOrDie(BlacklistFile));
-  DynamicallyInitializedGlobals.Init(M);
+  GlobalsMD.init(M);
 
   C = &(M.getContext());
-  LongSize = TD->getPointerSizeInBits();
+  LongSize = DL->getPointerSizeInBits();
   IntptrTy = Type::getIntNTy(*C, LongSize);
 
   AsanCtorFunction = Function::Create(
@@ -1141,10 +1263,9 @@ bool AddressSanitizer::doInitialization(Module &M) {
   AsanInitFunction->setLinkage(Function::ExternalLinkage);
   IRB.CreateCall(AsanInitFunction);
 
-  Mapping = getShadowMapping(M, LongSize, ZeroBaseShadow);
-  emitShadowMapping(M, IRB);
+  Mapping = getShadowMapping(M, LongSize);
 
-  appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndCtorPriority);
+  appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority);
   return true;
 }
 
@@ -1164,9 +1285,44 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
   return false;
 }
 
+void AddressSanitizer::InjectCoverageAtBlock(Function &F, BasicBlock &BB) {
+  BasicBlock::iterator IP = BB.getFirstInsertionPt(), BE = BB.end();
+  // Skip static allocas at the top of the entry block so they don't become
+  // dynamic when we split the block. If we used our optimized stack layout,
+  // then there will only be one alloca and it will come first.
+  for (; IP != BE; ++IP) {
+    AllocaInst *AI = dyn_cast<AllocaInst>(IP);
+    if (!AI || !AI->isStaticAlloca())
+      break;
+  }
+
+  DebugLoc EntryLoc = IP->getDebugLoc().getFnDebugLoc(*C);
+  IRBuilder<> IRB(IP);
+  IRB.SetCurrentDebugLocation(EntryLoc);
+  Type *Int8Ty = IRB.getInt8Ty();
+  GlobalVariable *Guard = new GlobalVariable(
+      *F.getParent(), Int8Ty, false, GlobalValue::PrivateLinkage,
+      Constant::getNullValue(Int8Ty), "__asan_gen_cov_" + F.getName());
+  LoadInst *Load = IRB.CreateLoad(Guard);
+  Load->setAtomic(Monotonic);
+  Load->setAlignment(1);
+  Value *Cmp = IRB.CreateICmpEQ(Constant::getNullValue(Int8Ty), Load);
+  Instruction *Ins = SplitBlockAndInsertIfThen(
+      Cmp, IP, false, MDBuilder(*C).createBranchWeights(1, 100000));
+  IRB.SetInsertPoint(Ins);
+  IRB.SetCurrentDebugLocation(EntryLoc);
+  // We pass &F to __sanitizer_cov. We could avoid this and rely on
+  // GET_CALLER_PC, but having the PC of the first instruction is just nice.
+  IRB.CreateCall(AsanCovFunction);
+  StoreInst *Store = IRB.CreateStore(ConstantInt::get(Int8Ty, 1), Guard);
+  Store->setAtomic(Monotonic);
+  Store->setAlignment(1);
+}
+
 // Poor man's coverage that works with ASan.
 // We create a Guard boolean variable with the same linkage
-// as the function and inject this code into the entry block:
+// as the function and inject this code into the entry block (-asan-coverage=1)
+// or all blocks (-asan-coverage=2):
 // if (*Guard) {
 //    __sanitizer_cov(&F);
 //    *Guard = 1;
@@ -1175,38 +1331,29 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
 // in __sanitizer_cov (it's fine to call it more than once).
 //
 // This coverage implementation provides very limited data:
-// it only tells if a given function was ever executed.
-// No counters, no per-basic-block or per-edge data.
+// it only tells if a given function (block) was ever executed.
+// No counters, no per-edge data.
 // But for many use cases this is what we need and the added slowdown
 // is negligible. This simple implementation will probably be obsoleted
 // by the upcoming Clang-based coverage implementation.
// By having it here and now we hope to // a) get the functionality to users earlier and // b) collect usage statistics to help improve Clang coverage design. -bool AddressSanitizer::InjectCoverage(Function &F) { +bool AddressSanitizer::InjectCoverage(Function &F, + const ArrayRef<BasicBlock *> AllBlocks) { if (!ClCoverage) return false; - IRBuilder<> IRB(F.getEntryBlock().getFirstInsertionPt()); - Type *Int8Ty = IRB.getInt8Ty(); - GlobalVariable *Guard = new GlobalVariable( - *F.getParent(), Int8Ty, false, GlobalValue::PrivateLinkage, - Constant::getNullValue(Int8Ty), "__asan_gen_cov_" + F.getName()); - LoadInst *Load = IRB.CreateLoad(Guard); - Load->setAtomic(Monotonic); - Load->setAlignment(1); - Value *Cmp = IRB.CreateICmpEQ(Constant::getNullValue(Int8Ty), Load); - Instruction *Ins = SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false); - IRB.SetInsertPoint(Ins); - // We pass &F to __sanitizer_cov. We could avoid this and rely on - // GET_CALLER_PC, but having the PC of the first instruction is just nice. - IRB.CreateCall(AsanCovFunction, IRB.CreatePointerCast(&F, IntptrTy)); - StoreInst *Store = IRB.CreateStore(ConstantInt::get(Int8Ty, 1), Guard); - Store->setAtomic(Monotonic); - Store->setAlignment(1); + + if (ClCoverage == 1 || + (unsigned)ClCoverageBlockThreshold < AllBlocks.size()) { + InjectCoverageAtBlock(F, F.getEntryBlock()); + } else { + for (auto BB : AllBlocks) + InjectCoverageAtBlock(F, *BB); + } return true; } bool AddressSanitizer::runOnFunction(Function &F) { - if (BL->isIn(F)) return false; if (&F == AsanCtorFunction) return false; if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false; DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n"); @@ -1226,28 +1373,35 @@ bool AddressSanitizer::runOnFunction(Function &F) { SmallSet<Value*, 16> TempsToInstrument; SmallVector<Instruction*, 16> ToInstrument; SmallVector<Instruction*, 8> NoReturnCalls; + SmallVector<BasicBlock*, 16> AllBlocks; + SmallVector<Instruction*, 16> PointerComparisonsOrSubtracts; int NumAllocas = 0; bool IsWrite; + unsigned Alignment; // Fill the set of memory operations to instrument. - for (Function::iterator FI = F.begin(), FE = F.end(); - FI != FE; ++FI) { + for (auto &BB : F) { + AllBlocks.push_back(&BB); TempsToInstrument.clear(); int NumInsnsPerBB = 0; - for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); - BI != BE; ++BI) { - if (LooksLikeCodeInBug11395(BI)) return false; - if (Value *Addr = isInterestingMemoryAccess(BI, &IsWrite)) { + for (auto &Inst : BB) { + if (LooksLikeCodeInBug11395(&Inst)) return false; + if (Value *Addr = + isInterestingMemoryAccess(&Inst, &IsWrite, &Alignment)) { if (ClOpt && ClOptSameTemp) { if (!TempsToInstrument.insert(Addr)) continue; // We've seen this temp in the current BB. } - } else if (isa<MemIntrinsic>(BI) && ClMemIntrin) { + } else if (ClInvalidPointerPairs && + isInterestingPointerComparisonOrSubtraction(&Inst)) { + PointerComparisonsOrSubtracts.push_back(&Inst); + continue; + } else if (isa<MemIntrinsic>(Inst)) { // ok, take it. } else { - if (isa<AllocaInst>(BI)) + if (isa<AllocaInst>(Inst)) NumAllocas++; - CallSite CS(BI); + CallSite CS(&Inst); if (CS) { // A call inside BB. 
TempsToInstrument.clear(); @@ -1256,14 +1410,14 @@ bool AddressSanitizer::runOnFunction(Function &F) { } continue; } - ToInstrument.push_back(BI); + ToInstrument.push_back(&Inst); NumInsnsPerBB++; if (NumInsnsPerBB >= ClMaxInsnsToInstrumentPerBB) break; } } - Function *UninstrumentedDuplicate = 0; + Function *UninstrumentedDuplicate = nullptr; bool LikelyToInstrument = !NoReturnCalls.empty() || !ToInstrument.empty() || (NumAllocas > 0); if (ClKeepUninstrumented && LikelyToInstrument) { @@ -1274,14 +1428,18 @@ bool AddressSanitizer::runOnFunction(Function &F) { F.getParent()->getFunctionList().push_back(UninstrumentedDuplicate); } + bool UseCalls = false; + if (ClInstrumentationWithCallsThreshold >= 0 && + ToInstrument.size() > (unsigned)ClInstrumentationWithCallsThreshold) + UseCalls = true; + // Instrument. int NumInstrumented = 0; - for (size_t i = 0, n = ToInstrument.size(); i != n; i++) { - Instruction *Inst = ToInstrument[i]; + for (auto Inst : ToInstrument) { if (ClDebugMin < 0 || ClDebugMax < 0 || (NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) { - if (isInterestingMemoryAccess(Inst, &IsWrite)) - instrumentMop(Inst); + if (isInterestingMemoryAccess(Inst, &IsWrite, &Alignment)) + instrumentMop(Inst, UseCalls); else instrumentMemIntrinsic(cast<MemIntrinsic>(Inst)); } @@ -1293,15 +1451,19 @@ bool AddressSanitizer::runOnFunction(Function &F) { // We must unpoison the stack before every NoReturn call (throw, _exit, etc). // See e.g. http://code.google.com/p/address-sanitizer/issues/detail?id=37 - for (size_t i = 0, n = NoReturnCalls.size(); i != n; i++) { - Instruction *CI = NoReturnCalls[i]; + for (auto CI : NoReturnCalls) { IRBuilder<> IRB(CI); IRB.CreateCall(AsanHandleNoReturnFunc); } + for (auto Inst : PointerComparisonsOrSubtracts) { + instrumentPointerComparisonOrSubtraction(Inst); + NumInstrumented++; + } + bool res = NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty(); - if (InjectCoverage(F)) + if (InjectCoverage(F, AllBlocks)) res = true; DEBUG(dbgs() << "ASAN done instrumenting: " << res << " " << F << "\n"); @@ -1323,32 +1485,6 @@ bool AddressSanitizer::runOnFunction(Function &F) { return res; } -static uint64_t ValueForPoison(uint64_t PoisonByte, size_t ShadowRedzoneSize) { - if (ShadowRedzoneSize == 1) return PoisonByte; - if (ShadowRedzoneSize == 2) return (PoisonByte << 8) + PoisonByte; - if (ShadowRedzoneSize == 4) - return (PoisonByte << 24) + (PoisonByte << 16) + - (PoisonByte << 8) + (PoisonByte); - llvm_unreachable("ShadowRedzoneSize is either 1, 2 or 4"); -} - -static void PoisonShadowPartialRightRedzone(uint8_t *Shadow, - size_t Size, - size_t RZSize, - size_t ShadowGranularity, - uint8_t Magic) { - for (size_t i = 0; i < RZSize; - i+= ShadowGranularity, Shadow++) { - if (i + ShadowGranularity <= Size) { - *Shadow = 0; // fully addressable - } else if (i >= Size) { - *Shadow = Magic; // unaddressable - } else { - *Shadow = Size - i; // first Size-i bytes are addressable - } - } -} - // Workaround for bug 11395: we don't want to instrument stack in functions // with large assembly blobs (32-bit only), otherwise reg alloc may crash. // FIXME: remove once the bug 11395 is fixed. 
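As a reading aid for the coverage hunks above: the code shape that InjectCoverageAtBlock plants in front of a block is easier to see in source form than as IRBuilder calls. The following is an illustrative C++ sketch, not the pass's actual output (the pass emits IR directly); Guard and coveragePrologue are hypothetical names, the relaxed atomics stand in for the Monotonic i8 load/store, and the zero-argument __sanitizer_cov() matches the kAsanCovName re-declaration above.

#include <atomic>

extern "C" void __sanitizer_cov();         // ASan runtime hook; takes no
                                           // arguments at this revision

// One guard byte per instrumented block; the pass creates it as the
// private-linkage global "__asan_gen_cov_<function>".
static std::atomic<unsigned char> Guard{0};

void coveragePrologue() {
  // Branch weights 1:100000 mark this path as essentially taken once.
  if (Guard.load(std::memory_order_relaxed) == 0) {
    __sanitizer_cov();                     // record that this block ran
    Guard.store(1, std::memory_order_relaxed);
  }
  // ... original block body continues here ...
}

The next hunk replaces the per-redzone stores in FunctionStackPoisoner::poisonRedZones with a loop that coalesces shadow bytes into the widest stores available. Below is a standalone sketch of the same packing logic under the same endianness rule; writeShadow is a stand-in for the store the pass emits at ShadowBase + i, and the 8-byte word size corresponds to ASan.LongSize / 8 on a 64-bit target.

#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in for the IR store at ShadowBase + Offset.
static void writeShadow(size_t Offset, uint64_t Val, size_t SizeInBytes) {
  std::printf("store i%zu 0x%llx at shadow+%zu\n", SizeInBytes * 8,
              (unsigned long long)Val, Offset);
}

// Pack the shadow bytes into 8-byte words first, then 4-, 2- and 1-byte
// remainders, skipping all-zero words just like `if (!Val) continue;`
// in the hunk below.
void poisonShadow(const std::vector<uint8_t> &ShadowBytes, bool LittleEndian) {
  size_t n = ShadowBytes.size(), i = 0;
  for (size_t Store = 8; Store != 0; Store /= 2) {
    for (; i + Store - 1 < n; i += Store) {
      uint64_t Val = 0;
      for (size_t j = 0; j < Store; j++)
        Val = LittleEndian ? Val | ((uint64_t)ShadowBytes[i + j] << (8 * j))
                           : (Val << 8) | ShadowBytes[i + j];
      if (!Val)
        continue;                // already-zero shadow needs no store
      writeShadow(i, Val, Store);
    }
  }
}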
@@ -1378,65 +1514,31 @@ void FunctionStackPoisoner::initializeCallbacks(Module &M) { kAsanUnpoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); } -void FunctionStackPoisoner::poisonRedZones( - const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> &IRB, Value *ShadowBase, - bool DoPoison) { - size_t ShadowRZSize = RedzoneSize() >> Mapping.Scale; - assert(ShadowRZSize >= 1 && ShadowRZSize <= 4); - Type *RZTy = Type::getIntNTy(*C, ShadowRZSize * 8); - Type *RZPtrTy = PointerType::get(RZTy, 0); - - Value *PoisonLeft = ConstantInt::get(RZTy, - ValueForPoison(DoPoison ? kAsanStackLeftRedzoneMagic : 0LL, ShadowRZSize)); - Value *PoisonMid = ConstantInt::get(RZTy, - ValueForPoison(DoPoison ? kAsanStackMidRedzoneMagic : 0LL, ShadowRZSize)); - Value *PoisonRight = ConstantInt::get(RZTy, - ValueForPoison(DoPoison ? kAsanStackRightRedzoneMagic : 0LL, ShadowRZSize)); - - // poison the first red zone. - IRB.CreateStore(PoisonLeft, IRB.CreateIntToPtr(ShadowBase, RZPtrTy)); - - // poison all other red zones. - uint64_t Pos = RedzoneSize(); - for (size_t i = 0, n = AllocaVec.size(); i < n; i++) { - AllocaInst *AI = AllocaVec[i]; - uint64_t SizeInBytes = getAllocaSizeInBytes(AI); - uint64_t AlignedSize = getAlignedAllocaSize(AI); - assert(AlignedSize - SizeInBytes < RedzoneSize()); - Value *Ptr = NULL; - - Pos += AlignedSize; - - assert(ShadowBase->getType() == IntptrTy); - if (SizeInBytes < AlignedSize) { - // Poison the partial redzone at right - Ptr = IRB.CreateAdd( - ShadowBase, ConstantInt::get(IntptrTy, - (Pos >> Mapping.Scale) - ShadowRZSize)); - size_t AddressableBytes = RedzoneSize() - (AlignedSize - SizeInBytes); - uint32_t Poison = 0; - if (DoPoison) { - PoisonShadowPartialRightRedzone((uint8_t*)&Poison, AddressableBytes, - RedzoneSize(), - 1ULL << Mapping.Scale, - kAsanStackPartialRedzoneMagic); - Poison = - ASan.TD->isLittleEndian() - ? support::endian::byte_swap<uint32_t, support::little>(Poison) - : support::endian::byte_swap<uint32_t, support::big>(Poison); +void +FunctionStackPoisoner::poisonRedZones(const ArrayRef<uint8_t> ShadowBytes, + IRBuilder<> &IRB, Value *ShadowBase, + bool DoPoison) { + size_t n = ShadowBytes.size(); + size_t i = 0; + // We need to (un)poison n bytes of stack shadow. Poison as many as we can + // using 64-bit stores (if we are on 64-bit arch), then poison the rest + // with 32-bit stores, then with 16-bit stores, then with 8-bit stores. + for (size_t LargeStoreSizeInBytes = ASan.LongSize / 8; + LargeStoreSizeInBytes != 0; LargeStoreSizeInBytes /= 2) { + for (; i + LargeStoreSizeInBytes - 1 < n; i += LargeStoreSizeInBytes) { + uint64_t Val = 0; + for (size_t j = 0; j < LargeStoreSizeInBytes; j++) { + if (ASan.DL->isLittleEndian()) + Val |= (uint64_t)ShadowBytes[i + j] << (8 * j); + else + Val = (Val << 8) | ShadowBytes[i + j]; } - Value *PartialPoison = ConstantInt::get(RZTy, Poison); - IRB.CreateStore(PartialPoison, IRB.CreateIntToPtr(Ptr, RZPtrTy)); + if (!Val) continue; + Value *Ptr = IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i)); + Type *StoreTy = Type::getIntNTy(*C, LargeStoreSizeInBytes * 8); + Value *Poison = ConstantInt::get(StoreTy, DoPoison ? Val : 0); + IRB.CreateStore(Poison, IRB.CreateIntToPtr(Ptr, StoreTy->getPointerTo())); } - - // Poison the full redzone at right. - Ptr = IRB.CreateAdd(ShadowBase, - ConstantInt::get(IntptrTy, Pos >> Mapping.Scale)); - bool LastAlloca = (i == AllocaVec.size() - 1); - Value *Poison = LastAlloca ?
PoisonRight : PoisonMid; - IRB.CreateStore(Poison, IRB.CreateIntToPtr(Ptr, RZPtrTy)); - - Pos += RedzoneSize(); } } @@ -1467,25 +1569,47 @@ void FunctionStackPoisoner::SetShadowToStackAfterReturnInlined( } } -void FunctionStackPoisoner::poisonStack() { - uint64_t LocalStackSize = TotalStackSize + - (AllocaVec.size() + 1) * RedzoneSize(); +static DebugLoc getFunctionEntryDebugLocation(Function &F) { + for (const auto &Inst : F.getEntryBlock()) + if (!isa<AllocaInst>(Inst)) + return Inst.getDebugLoc(); + return DebugLoc(); +} - bool DoStackMalloc = ASan.CheckUseAfterReturn - && LocalStackSize <= kMaxStackMallocSize; +void FunctionStackPoisoner::poisonStack() { int StackMallocIdx = -1; + DebugLoc EntryDebugLocation = getFunctionEntryDebugLocation(F); assert(AllocaVec.size() > 0); Instruction *InsBefore = AllocaVec[0]; IRBuilder<> IRB(InsBefore); - + IRB.SetCurrentDebugLocation(EntryDebugLocation); + + SmallVector<ASanStackVariableDescription, 16> SVD; + SVD.reserve(AllocaVec.size()); + for (AllocaInst *AI : AllocaVec) { + ASanStackVariableDescription D = { AI->getName().data(), + getAllocaSizeInBytes(AI), + AI->getAlignment(), AI, 0}; + SVD.push_back(D); + } + // Minimal header size (left redzone) is 4 pointers, + // i.e. 32 bytes on 64-bit platforms and 16 bytes in 32-bit platforms. + size_t MinHeaderSize = ASan.LongSize / 2; + ASanStackFrameLayout L; + ComputeASanStackFrameLayout(SVD, 1UL << Mapping.Scale, MinHeaderSize, &L); + DEBUG(dbgs() << L.DescriptionString << " --- " << L.FrameSize << "\n"); + uint64_t LocalStackSize = L.FrameSize; + bool DoStackMalloc = + ClUseAfterReturn && LocalStackSize <= kMaxStackMallocSize; Type *ByteArrayTy = ArrayType::get(IRB.getInt8Ty(), LocalStackSize); AllocaInst *MyAlloca = new AllocaInst(ByteArrayTy, "MyAlloca", InsBefore); - if (ClRealignStack && StackAlignment < RedzoneSize()) - StackAlignment = RedzoneSize(); - MyAlloca->setAlignment(StackAlignment); + MyAlloca->setDebugLoc(EntryDebugLocation); + assert((ClRealignStack & (ClRealignStack - 1)) == 0); + size_t FrameAlignment = std::max(L.FrameAlignment, (size_t)ClRealignStack); + MyAlloca->setAlignment(FrameAlignment); assert(MyAlloca->isStaticAlloca()); Value *OrigStackBase = IRB.CreatePointerCast(MyAlloca, IntptrTy); Value *LocalStackBase = OrigStackBase; @@ -1500,30 +1624,25 @@ void FunctionStackPoisoner::poisonStack() { kAsanOptionDetectUAR, IRB.getInt32Ty()); Value *Cmp = IRB.CreateICmpNE(IRB.CreateLoad(OptionDetectUAR), Constant::getNullValue(IRB.getInt32Ty())); - Instruction *Term = - SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false); + Instruction *Term = SplitBlockAndInsertIfThen(Cmp, InsBefore, false); BasicBlock *CmpBlock = cast<Instruction>(Cmp)->getParent(); IRBuilder<> IRBIf(Term); + IRBIf.SetCurrentDebugLocation(EntryDebugLocation); LocalStackBase = IRBIf.CreateCall2( AsanStackMallocFunc[StackMallocIdx], ConstantInt::get(IntptrTy, LocalStackSize), OrigStackBase); BasicBlock *SetBlock = cast<Instruction>(LocalStackBase)->getParent(); IRB.SetInsertPoint(InsBefore); + IRB.SetCurrentDebugLocation(EntryDebugLocation); PHINode *Phi = IRB.CreatePHI(IntptrTy, 2); Phi->addIncoming(OrigStackBase, CmpBlock); Phi->addIncoming(LocalStackBase, SetBlock); LocalStackBase = Phi; } - // This string will be parsed by the run-time (DescribeAddressIfStack). - SmallString<2048> StackDescriptionStorage; - raw_svector_ostream StackDescription(StackDescriptionStorage); - StackDescription << AllocaVec.size() << " "; - // Insert poison calls for lifetime intrinsics for alloca. 
bool HavePoisonedAllocas = false; - for (size_t i = 0, n = AllocaPoisonCallVec.size(); i < n; i++) { - const AllocaPoisonCall &APC = AllocaPoisonCallVec[i]; + for (const auto &APC : AllocaPoisonCallVec) { assert(APC.InsBefore); assert(APC.AI); IRBuilder<> IRB(APC.InsBefore); @@ -1531,24 +1650,15 @@ void FunctionStackPoisoner::poisonStack() { HavePoisonedAllocas |= APC.DoPoison; } - uint64_t Pos = RedzoneSize(); // Replace Alloca instructions with base+offset. - for (size_t i = 0, n = AllocaVec.size(); i < n; i++) { - AllocaInst *AI = AllocaVec[i]; - uint64_t SizeInBytes = getAllocaSizeInBytes(AI); - StringRef Name = AI->getName(); - StackDescription << Pos << " " << SizeInBytes << " " - << Name.size() << " " << Name << " "; - uint64_t AlignedSize = getAlignedAllocaSize(AI); - assert((AlignedSize % RedzoneSize()) == 0); + for (const auto &Desc : SVD) { + AllocaInst *AI = Desc.AI; Value *NewAllocaPtr = IRB.CreateIntToPtr( - IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Pos)), - AI->getType()); + IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Desc.Offset)), + AI->getType()); replaceDbgDeclareForAlloca(AI, NewAllocaPtr, DIB); AI->replaceAllUsesWith(NewAllocaPtr); - Pos += AlignedSize + RedzoneSize(); } - assert(Pos == LocalStackSize); // The left-most redzone has enough space for at least 4 pointers. // Write the Magic value to redzone[0]. @@ -1560,7 +1670,8 @@ void FunctionStackPoisoner::poisonStack() { IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, ASan.LongSize/8)), IntptrPtrTy); GlobalVariable *StackDescriptionGlobal = - createPrivateGlobalForString(*F.getParent(), StackDescription.str()); + createPrivateGlobalForString(*F.getParent(), L.DescriptionString, + /*AllowMerging*/true); Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal, IntptrTy); IRB.CreateStore(Description, BasePlus1); @@ -1573,30 +1684,32 @@ void FunctionStackPoisoner::poisonStack() { // Poison the stack redzones at the entry. Value *ShadowBase = ASan.memToShadow(LocalStackBase, IRB); - poisonRedZones(AllocaVec, IRB, ShadowBase, true); + poisonRedZones(L.ShadowBytes, IRB, ShadowBase, true); - // Unpoison the stack before all ret instructions. - for (size_t i = 0, n = RetVec.size(); i < n; i++) { - Instruction *Ret = RetVec[i]; + // (Un)poison the stack before all ret instructions. + for (auto Ret : RetVec) { IRBuilder<> IRBRet(Ret); // Mark the current frame as retired. IRBRet.CreateStore(ConstantInt::get(IntptrTy, kRetiredStackFrameMagic), BasePlus0); - // Unpoison the stack. - poisonRedZones(AllocaVec, IRBRet, ShadowBase, false); if (DoStackMalloc) { assert(StackMallocIdx >= 0); - // In use-after-return mode, mark the whole stack frame unaddressable. + // if LocalStackBase != OrigStackBase: + // // In use-after-return mode, poison the whole stack frame. 
+ // if StackMallocIdx <= 4 + // // For small sizes inline the whole thing: + // memset(ShadowBase, kAsanStackAfterReturnMagic, ShadowSize); + // **SavedFlagPtr(LocalStackBase) = 0 + // else + // __asan_stack_free_N(LocalStackBase, OrigStackBase) + // else + // <This is not a fake stack; unpoison the redzones> + Value *Cmp = IRBRet.CreateICmpNE(LocalStackBase, OrigStackBase); + TerminatorInst *ThenTerm, *ElseTerm; + SplitBlockAndInsertIfThenElse(Cmp, Ret, &ThenTerm, &ElseTerm); + + IRBuilder<> IRBPoison(ThenTerm); if (StackMallocIdx <= 4) { - // For small sizes inline the whole thing: - // if LocalStackBase != OrigStackBase: - // memset(ShadowBase, kAsanStackAfterReturnMagic, ShadowSize); - // **SavedFlagPtr(LocalStackBase) = 0 - // FIXME: if LocalStackBase != OrigStackBase don't call poisonRedZones. - Value *Cmp = IRBRet.CreateICmpNE(LocalStackBase, OrigStackBase); - TerminatorInst *PoisonTerm = - SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false); - IRBuilder<> IRBPoison(PoisonTerm); int ClassSize = kMinStackMallocSize << StackMallocIdx; SetShadowToStackAfterReturnInlined(IRBPoison, ShadowBase, ClassSize >> Mapping.Scale); @@ -1610,21 +1723,26 @@ void FunctionStackPoisoner::poisonStack() { IRBPoison.CreateIntToPtr(SavedFlagPtr, IRBPoison.getInt8PtrTy())); } else { // For larger frames call __asan_stack_free_*. - IRBRet.CreateCall3(AsanStackFreeFunc[StackMallocIdx], LocalStackBase, - ConstantInt::get(IntptrTy, LocalStackSize), - OrigStackBase); + IRBPoison.CreateCall3(AsanStackFreeFunc[StackMallocIdx], LocalStackBase, + ConstantInt::get(IntptrTy, LocalStackSize), + OrigStackBase); } + + IRBuilder<> IRBElse(ElseTerm); + poisonRedZones(L.ShadowBytes, IRBElse, ShadowBase, false); } else if (HavePoisonedAllocas) { // If we poisoned some allocas in llvm.lifetime analysis, // unpoison whole stack frame now. assert(LocalStackBase == OrigStackBase); poisonAlloca(LocalStackBase, LocalStackSize, IRBRet, false); + } else { + poisonRedZones(L.ShadowBytes, IRBRet, ShadowBase, false); } } // We are done. Remove the old unused alloca instructions. - for (size_t i = 0, n = AllocaVec.size(); i < n; i++) - AllocaVec[i]->eraseFromParent(); + for (auto AI : AllocaVec) + AI->eraseFromParent(); } void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size, @@ -1649,7 +1767,7 @@ void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size, AllocaInst *FunctionStackPoisoner::findAllocaForValue(Value *V) { if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) // We're interested only in allocas we can handle. - return isInterestingAlloca(*AI) ? AI : 0; + return isInterestingAlloca(*AI) ? AI : nullptr; // See if we've already calculated (or started to calculate) alloca for a // given value. AllocaForValueMapTy::iterator I = AllocaForValue.find(V); @@ -1657,8 +1775,8 @@ AllocaInst *FunctionStackPoisoner::findAllocaForValue(Value *V) { return I->second; // Store 0 while we're calculating alloca for value V to avoid // infinite recursion if the value references itself. - AllocaForValue[V] = 0; - AllocaInst *Res = 0; + AllocaForValue[V] = nullptr; + AllocaInst *Res = nullptr; if (CastInst *CI = dyn_cast<CastInst>(V)) Res = findAllocaForValue(CI->getOperand(0)); else if (PHINode *PN = dyn_cast<PHINode>(V)) { @@ -1668,12 +1786,12 @@ AllocaInst *FunctionStackPoisoner::findAllocaForValue(Value *V) { if (IncValue == PN) continue; AllocaInst *IncValueAI = findAllocaForValue(IncValue); // AI for incoming values should exist and should all be equal.
- if (IncValueAI == 0 || (Res != 0 && IncValueAI != Res)) - return 0; + if (IncValueAI == nullptr || (Res != nullptr && IncValueAI != Res)) + return nullptr; Res = IncValueAI; } } - if (Res != 0) + if (Res) AllocaForValue[V] = Res; return Res; } diff --git a/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp index 7a9f0f6..9a5cea8 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -12,22 +12,23 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "bounds-checking" #include "llvm/Transforms/Instrumentation.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetFolder.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/InstIterator.h" -#include "llvm/Support/TargetFolder.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLibraryInfo.h" using namespace llvm; +#define DEBUG_TYPE "bounds-checking" + static cl::opt<bool> SingleTrapBB("bounds-checking-single-trap", cl::desc("Use one trap block per function")); @@ -45,15 +46,15 @@ namespace { initializeBoundsCheckingPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<DataLayout>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DataLayoutPass>(); AU.addRequired<TargetLibraryInfo>(); } private: - const DataLayout *TD; + const DataLayout *DL; const TargetLibraryInfo *TLI; ObjectSizeOffsetEvaluator *ObjSizeEval; BuilderTy *Builder; @@ -61,9 +62,7 @@ namespace { BasicBlock *TrapBB; BasicBlock *getTrapBB(); - void emitBranchToTrap(Value *Cmp = 0); - bool computeAllocSize(Value *Ptr, APInt &Offset, Value* &OffsetValue, - APInt &Size, Value* &SizeValue); + void emitBranchToTrap(Value *Cmp = nullptr); bool instrument(Value *Ptr, Value *Val); }; } @@ -105,7 +104,7 @@ void BoundsChecking::emitBranchToTrap(Value *Cmp) { if (!C->getZExtValue()) return; else - Cmp = 0; // unconditional branch + Cmp = nullptr; // unconditional branch } ++ChecksAdded; @@ -127,7 +126,7 @@ void BoundsChecking::emitBranchToTrap(Value *Cmp) { /// size of memory block that is touched. /// Returns true if any change was made to the IR, false otherwise. 
bool BoundsChecking::instrument(Value *Ptr, Value *InstVal) { - uint64_t NeededSize = TD->getTypeStoreSize(InstVal->getType()); + uint64_t NeededSize = DL->getTypeStoreSize(InstVal->getType()); DEBUG(dbgs() << "Instrument " << *Ptr << " for " << Twine(NeededSize) << " bytes\n"); @@ -142,7 +141,7 @@ bool BoundsChecking::instrument(Value *Ptr, Value *InstVal) { Value *Offset = SizeOffset.second; ConstantInt *SizeCI = dyn_cast<ConstantInt>(Size); - Type *IntTy = TD->getIntPtrType(Ptr->getType()); + Type *IntTy = DL->getIntPtrType(Ptr->getType()); Value *NeededSizeVal = ConstantInt::get(IntTy, NeededSize); // three checks are required to ensure safety: @@ -166,13 +165,13 @@ bool BoundsChecking::instrument(Value *Ptr, Value *InstVal) { } bool BoundsChecking::runOnFunction(Function &F) { - TD = &getAnalysis<DataLayout>(); + DL = &getAnalysis<DataLayoutPass>().getDataLayout(); TLI = &getAnalysis<TargetLibraryInfo>(); - TrapBB = 0; - BuilderTy TheBuilder(F.getContext(), TargetFolder(TD)); + TrapBB = nullptr; + BuilderTy TheBuilder(F.getContext(), TargetFolder(DL)); Builder = &TheBuilder; - ObjectSizeOffsetEvaluator TheObjSizeEval(TD, TLI, F.getContext(), + ObjectSizeOffsetEvaluator TheObjSizeEval(DL, TLI, F.getContext(), /*RoundToAlign=*/true); ObjSizeEval = &TheObjSizeEval; diff --git a/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 9b9e725..35057cd 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -50,19 +50,23 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstVisitor.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" -#include "llvm/InstVisitor.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/SpecialCaseList.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/SpecialCaseList.h" +#include <algorithm> #include <iterator> +#include <set> +#include <utility> using namespace llvm; @@ -96,6 +100,22 @@ static cl::opt<bool> ClArgsABI( cl::desc("Use the argument ABI rather than the TLS ABI"), cl::Hidden); +// Controls whether the pass includes or ignores the labels of pointers in load +// instructions. +static cl::opt<bool> ClCombinePointerLabelsOnLoad( + "dfsan-combine-pointer-labels-on-load", + cl::desc("Combine the label of the pointer with the label of the data when " + "loading from memory."), + cl::Hidden, cl::init(true)); + +// Controls whether the pass includes or ignores the labels of pointers in +// stores instructions. +static cl::opt<bool> ClCombinePointerLabelsOnStore( + "dfsan-combine-pointer-labels-on-store", + cl::desc("Combine the label of the pointer with the label of the data when " + "storing in memory."), + cl::Hidden, cl::init(false)); + static cl::opt<bool> ClDebugNonzeroLabels( "dfsan-debug-nonzero-labels", cl::desc("Insert calls to __dfsan_nonzero_label on observing a parameter, " @@ -104,6 +124,51 @@ static cl::opt<bool> ClDebugNonzeroLabels( namespace { +StringRef GetGlobalTypeString(const GlobalValue &G) { + // Types of GlobalVariables are always pointer types. 
+ Type *GType = G.getType()->getElementType(); + // For now we support blacklisting struct types only. + if (StructType *SGType = dyn_cast<StructType>(GType)) { + if (!SGType->isLiteral()) + return SGType->getName(); + } + return "<unknown type>"; +} + +class DFSanABIList { + std::unique_ptr<SpecialCaseList> SCL; + + public: + DFSanABIList(SpecialCaseList *SCL) : SCL(SCL) {} + + /// Returns whether either this function or its source file are listed in the + /// given category. + bool isIn(const Function &F, const StringRef Category) const { + return isIn(*F.getParent(), Category) || + SCL->inSection("fun", F.getName(), Category); + } + + /// Returns whether this global alias is listed in the given category. + /// + /// If GA aliases a function, the alias's name is matched as a function name + /// would be. Similarly, aliases of globals are matched like globals. + bool isIn(const GlobalAlias &GA, const StringRef Category) const { + if (isIn(*GA.getParent(), Category)) + return true; + + if (isa<FunctionType>(GA.getType()->getElementType())) + return SCL->inSection("fun", GA.getName(), Category); + + return SCL->inSection("global", GA.getName(), Category) || + SCL->inSection("type", GetGlobalTypeString(GA), Category); + } + + /// Returns whether this module is listed in the given category. + bool isIn(const Module &M, const StringRef Category) const { + return SCL->inSection("src", M.getModuleIdentifier(), Category); + } +}; + class DataFlowSanitizer : public ModulePass { friend struct DFSanFunction; friend class DFSanVisitor; @@ -148,7 +213,7 @@ class DataFlowSanitizer : public ModulePass { WK_Custom }; - DataLayout *DL; + const DataLayout *DL; Module *Mod; LLVMContext *Ctx; IntegerType *ShadowTy; @@ -174,12 +239,11 @@ class DataFlowSanitizer : public ModulePass { Constant *DFSanSetLabelFn; Constant *DFSanNonzeroLabelFn; MDNode *ColdCallWeights; - OwningPtr<SpecialCaseList> ABIList; + DFSanABIList ABIList; DenseMap<Value *, Function *> UnwrappedFnMap; AttributeSet ReadOnlyNoneAttrs; Value *getShadowAddress(Value *Addr, Instruction *Pos); - Value *combineShadows(Value *V1, Value *V2, Instruction *Pos); bool isInstrumented(const Function *F); bool isInstrumented(const GlobalAlias *GA); FunctionType *getArgsFunctionType(FunctionType *T); @@ -195,15 +259,17 @@ class DataFlowSanitizer : public ModulePass { public: DataFlowSanitizer(StringRef ABIListFile = StringRef(), - void *(*getArgTLS)() = 0, void *(*getRetValTLS)() = 0); + void *(*getArgTLS)() = nullptr, + void *(*getRetValTLS)() = nullptr); static char ID; - bool doInitialization(Module &M); - bool runOnModule(Module &M); + bool doInitialization(Module &M) override; + bool runOnModule(Module &M) override; }; struct DFSanFunction { DataFlowSanitizer &DFS; Function *F; + DominatorTree DT; DataFlowSanitizer::InstrumentedABI IA; bool IsNativeABI; Value *ArgTLSPtr; @@ -215,15 +281,26 @@ struct DFSanFunction { DenseSet<Instruction *> SkipInsts; DenseSet<Value *> NonZeroChecks; + struct CachedCombinedShadow { + BasicBlock *Block; + Value *Shadow; + }; + DenseMap<std::pair<Value *, Value *>, CachedCombinedShadow> + CachedCombinedShadows; + DenseMap<Value *, std::set<Value *>> ShadowElements; + DFSanFunction(DataFlowSanitizer &DFS, Function *F, bool IsNativeABI) : DFS(DFS), F(F), IA(DFS.getInstrumentedABI()), - IsNativeABI(IsNativeABI), ArgTLSPtr(0), RetvalTLSPtr(0), - LabelReturnAlloca(0) {} + IsNativeABI(IsNativeABI), ArgTLSPtr(nullptr), RetvalTLSPtr(nullptr), + LabelReturnAlloca(nullptr) { + DT.recalculate(*F); + } Value *getArgTLSPtr(); Value 
*getArgTLS(unsigned Index, Instruction *Pos); Value *getRetvalTLS(); Value *getShadow(Value *V); void setShadow(Instruction *I, Value *Shadow); + Value *combineShadows(Value *V1, Value *V2, Instruction *Pos); Value *combineOperandShadows(Instruction *Inst); Value *loadShadow(Value *ShadowAddr, uint64_t Size, uint64_t Align, Instruction *Pos); @@ -287,7 +364,7 @@ FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) { ArgTypes.push_back(ShadowPtrTy); Type *RetType = T->getReturnType(); if (!RetType->isVoidTy()) - RetType = StructType::get(RetType, ShadowTy, (Type *)0); + RetType = StructType::get(RetType, ShadowTy, (Type *)nullptr); return FunctionType::get(RetType, ArgTypes, T->isVarArg()); } @@ -327,9 +404,10 @@ FunctionType *DataFlowSanitizer::getCustomFunctionType(FunctionType *T) { } bool DataFlowSanitizer::doInitialization(Module &M) { - DL = getAnalysisIfAvailable<DataLayout>(); - if (!DL) - return false; + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + if (!DLP) + report_fatal_error("data layout missing"); + DL = &DLP->getDataLayout(); Mod = &M; Ctx = &M.getContext(); @@ -356,18 +434,20 @@ bool DataFlowSanitizer::doInitialization(Module &M) { if (GetArgTLSPtr) { Type *ArgTLSTy = ArrayType::get(ShadowTy, 64); - ArgTLS = 0; + ArgTLS = nullptr; GetArgTLS = ConstantExpr::getIntToPtr( ConstantInt::get(IntptrTy, uintptr_t(GetArgTLSPtr)), PointerType::getUnqual( - FunctionType::get(PointerType::getUnqual(ArgTLSTy), (Type *)0))); + FunctionType::get(PointerType::getUnqual(ArgTLSTy), + (Type *)nullptr))); } if (GetRetvalTLSPtr) { - RetvalTLS = 0; + RetvalTLS = nullptr; GetRetvalTLS = ConstantExpr::getIntToPtr( ConstantInt::get(IntptrTy, uintptr_t(GetRetvalTLSPtr)), PointerType::getUnqual( - FunctionType::get(PointerType::getUnqual(ShadowTy), (Type *)0))); + FunctionType::get(PointerType::getUnqual(ShadowTy), + (Type *)nullptr))); } ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000); @@ -375,11 +455,11 @@ bool DataFlowSanitizer::doInitialization(Module &M) { } bool DataFlowSanitizer::isInstrumented(const Function *F) { - return !ABIList->isIn(*F, "uninstrumented"); + return !ABIList.isIn(*F, "uninstrumented"); } bool DataFlowSanitizer::isInstrumented(const GlobalAlias *GA) { - return !ABIList->isIn(*GA, "uninstrumented"); + return !ABIList.isIn(*GA, "uninstrumented"); } DataFlowSanitizer::InstrumentedABI DataFlowSanitizer::getInstrumentedABI() { @@ -387,11 +467,11 @@ DataFlowSanitizer::InstrumentedABI DataFlowSanitizer::getInstrumentedABI() { } DataFlowSanitizer::WrapperKind DataFlowSanitizer::getWrapperKind(Function *F) { - if (ABIList->isIn(*F, "functional")) + if (ABIList.isIn(*F, "functional")) return WK_Functional; - if (ABIList->isIn(*F, "discard")) + if (ABIList.isIn(*F, "discard")) return WK_Discard; - if (ABIList->isIn(*F, "custom")) + if (ABIList.isIn(*F, "custom")) return WK_Custom; return WK_Warning; @@ -480,7 +560,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) { if (!DL) return false; - if (ABIList->isIn(M, "skip")) + if (ABIList.isIn(M, "skip")) return false; if (!GetArgTLSPtr) { @@ -505,6 +585,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) { DFSanUnionLoadFn = Mod->getOrInsertFunction("__dfsan_union_load", DFSanUnionLoadFnTy); if (Function *F = dyn_cast<Function>(DFSanUnionLoadFn)) { + F->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadOnly); F->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt); } DFSanUnimplementedFn = @@ -536,8 +617,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) { ++i; // Don't stop 
on weak. We assume people aren't playing games with the // instrumentedness of overridden weak aliases. - if (Function *F = dyn_cast<Function>( - GA->resolveAliasedGlobal(/*stopOnWeak=*/false))) { + if (auto F = dyn_cast<Function>(GA->getBaseObject())) { bool GAInst = isInstrumented(GA), FInst = isInstrumented(F); if (GAInst && FInst) { addGlobalNamePrefix(GA); @@ -547,7 +627,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) { // below will take care of instrumenting it. Function *NewF = buildWrapperFunction(F, "", GA->getLinkage(), F->getFunctionType()); - GA->replaceAllUsesWith(NewF); + GA->replaceAllUsesWith(ConstantExpr::getBitCast(NewF, GA->getType())); NewF->takeName(GA); GA->eraseFromParent(); FnsToInstrument.push_back(NewF); @@ -589,10 +669,10 @@ bool DataFlowSanitizer::runOnModule(Module &M) { } NewF->getBasicBlockList().splice(NewF->begin(), F.getBasicBlockList()); - for (Function::use_iterator ui = F.use_begin(), ue = F.use_end(); - ui != ue;) { - BlockAddress *BA = dyn_cast<BlockAddress>(ui.getUse().getUser()); - ++ui; + for (Function::user_iterator UI = F.user_begin(), UE = F.user_end(); + UI != UE;) { + BlockAddress *BA = dyn_cast<BlockAddress>(*UI); + ++UI; if (BA) { BA->replaceAllUsesWith( BlockAddress::get(NewF, BA->getBasicBlock())); @@ -612,7 +692,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) { // function... yet. } else if (FT->isVarArg()) { UnwrappedFnMap[&F] = &F; - *i = 0; + *i = nullptr; } else if (!IsZeroArgsVoidRet || getWrapperKind(&F) == WK_Custom) { // Build a wrapper function for F. The wrapper simply calls F, and is // added to FnsToInstrument so that any instrumentation according to its @@ -663,9 +743,8 @@ bool DataFlowSanitizer::runOnModule(Module &M) { // DFSanVisitor may create new basic blocks, which confuses df_iterator. // Build a copy of the list before iterating over it. - llvm::SmallVector<BasicBlock *, 4> BBList; - std::copy(df_begin(&(*i)->getEntryBlock()), df_end(&(*i)->getEntryBlock()), - std::back_inserter(BBList)); + llvm::SmallVector<BasicBlock *, 4> BBList( + depth_first(&(*i)->getEntryBlock())); for (llvm::SmallVector<BasicBlock *, 4>::iterator i = BBList.begin(), e = BBList.end(); @@ -718,10 +797,9 @@ bool DataFlowSanitizer::runOnModule(Module &M) { while (isa<PHINode>(Pos) || isa<AllocaInst>(Pos)) Pos = Pos->getNextNode(); IRBuilder<> IRB(Pos); - Instruction *NeInst = cast<Instruction>( - IRB.CreateICmpNE(*i, DFSF.DFS.ZeroShadow)); + Value *Ne = IRB.CreateICmpNE(*i, DFSF.DFS.ZeroShadow); BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen( - NeInst, /*Unreachable=*/ false, ColdCallWeights)); + Ne, Pos, /*Unreachable=*/false, ColdCallWeights)); IRBuilder<> ThenIRB(BI); ThenIRB.CreateCall(DFSF.DFS.DFSanNonzeroLabelFn); } @@ -810,37 +888,72 @@ Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos) { // Generates IR to compute the union of the two given shadows, inserting it // before Pos. Returns the computed union Value. 
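Before the hunks that follow: combineShadows moves from DataFlowSanitizer onto DFSanFunction and gains two fast paths, a per-pair cache of previously emitted unions and a subset test over the set of primitive shadows feeding each union. The sketch below is a simplified standalone model of that bookkeeping, with plain integers standing in for llvm::Value* shadows; emitUnionCall is a stand-in for the __dfsan_union call the real code emits behind a compare-and-branch, and the dominator-tree check that guards cache hits is reduced to a comment.

#include <algorithm>
#include <map>
#include <set>
#include <utility>

using Shadow = int;                       // stand-in for llvm::Value *
constexpr Shadow ZeroShadow = 0;

static Shadow NextShadow = 1000;
Shadow emitUnionCall(Shadow, Shadow) {    // stand-in for __dfsan_union
  return ++NextShadow;                    // pretend a fresh label was made
}

static std::map<Shadow, std::set<Shadow>> ShadowElements;
static std::map<std::pair<Shadow, Shadow>, Shadow> CachedCombinedShadows;

Shadow combineShadows(Shadow V1, Shadow V2) {
  if (V1 == ZeroShadow) return V2;        // union with zero is the identity
  if (V2 == ZeroShadow) return V1;
  if (V1 == V2) return V1;

  // If one operand's element set already covers the other's, the union
  // is redundant and the larger shadow can be reused as-is.
  auto E1 = ShadowElements.find(V1), E2 = ShadowElements.find(V2);
  if (E1 != ShadowElements.end() && E2 != ShadowElements.end()) {
    if (std::includes(E1->second.begin(), E1->second.end(),
                      E2->second.begin(), E2->second.end()))
      return V1;
    if (std::includes(E2->second.begin(), E2->second.end(),
                      E1->second.begin(), E1->second.end()))
      return V2;
  } else if (E1 != ShadowElements.end() && E1->second.count(V2)) {
    return V1;
  } else if (E2 != ShadowElements.end() && E2->second.count(V1)) {
    return V2;
  }

  // Canonicalize the pair so (a, b) and (b, a) hit the same cache slot.
  std::pair<Shadow, Shadow> Key = std::minmax(V1, V2);
  auto Hit = CachedCombinedShadows.find(Key);
  if (Hit != CachedCombinedShadows.end())
    return Hit->second;                   // the real code also requires the
                                          // cached block to dominate the use

  Shadow U = emitUnionCall(V1, V2);
  CachedCombinedShadows[Key] = U;

  // Remember which primitive shadows feed U for later subset tests.
  std::set<Shadow> Elems;
  if (E1 != ShadowElements.end()) Elems = E1->second; else Elems.insert(V1);
  if (E2 != ShadowElements.end())
    Elems.insert(E2->second.begin(), E2->second.end());
  else
    Elems.insert(V2);
  ShadowElements[U] = std::move(Elems);
  return U;
}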
-Value *DataFlowSanitizer::combineShadows(Value *V1, Value *V2, - Instruction *Pos) { - if (V1 == ZeroShadow) +Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) { + if (V1 == DFS.ZeroShadow) return V2; - if (V2 == ZeroShadow) + if (V2 == DFS.ZeroShadow) return V1; if (V1 == V2) return V1; + + auto V1Elems = ShadowElements.find(V1); + auto V2Elems = ShadowElements.find(V2); + if (V1Elems != ShadowElements.end() && V2Elems != ShadowElements.end()) { + if (std::includes(V1Elems->second.begin(), V1Elems->second.end(), + V2Elems->second.begin(), V2Elems->second.end())) { + return V1; + } else if (std::includes(V2Elems->second.begin(), V2Elems->second.end(), + V1Elems->second.begin(), V1Elems->second.end())) { + return V2; + } + } else if (V1Elems != ShadowElements.end()) { + if (V1Elems->second.count(V2)) + return V1; + } else if (V2Elems != ShadowElements.end()) { + if (V2Elems->second.count(V1)) + return V2; + } + + auto Key = std::make_pair(V1, V2); + if (V1 > V2) + std::swap(Key.first, Key.second); + CachedCombinedShadow &CCS = CachedCombinedShadows[Key]; + if (CCS.Block && DT.dominates(CCS.Block, Pos->getParent())) + return CCS.Shadow; + IRBuilder<> IRB(Pos); BasicBlock *Head = Pos->getParent(); Value *Ne = IRB.CreateICmpNE(V1, V2); - Instruction *NeInst = dyn_cast<Instruction>(Ne); - if (NeInst) { - BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen( - NeInst, /*Unreachable=*/ false, ColdCallWeights)); - IRBuilder<> ThenIRB(BI); - CallInst *Call = ThenIRB.CreateCall2(DFSanUnionFn, V1, V2); - Call->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt); - Call->addAttribute(1, Attribute::ZExt); - Call->addAttribute(2, Attribute::ZExt); - - BasicBlock *Tail = BI->getSuccessor(0); - PHINode *Phi = PHINode::Create(ShadowTy, 2, "", Tail->begin()); - Phi->addIncoming(Call, Call->getParent()); - Phi->addIncoming(V1, Head); - Pos = Phi; - return Phi; + BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen( + Ne, Pos, /*Unreachable=*/false, DFS.ColdCallWeights, &DT)); + IRBuilder<> ThenIRB(BI); + CallInst *Call = ThenIRB.CreateCall2(DFS.DFSanUnionFn, V1, V2); + Call->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt); + Call->addAttribute(1, Attribute::ZExt); + Call->addAttribute(2, Attribute::ZExt); + + BasicBlock *Tail = BI->getSuccessor(0); + PHINode *Phi = PHINode::Create(DFS.ShadowTy, 2, "", Tail->begin()); + Phi->addIncoming(Call, Call->getParent()); + Phi->addIncoming(V1, Head); + + CCS.Block = Tail; + CCS.Shadow = Phi; + + std::set<Value *> UnionElems; + if (V1Elems != ShadowElements.end()) { + UnionElems = V1Elems->second; + } else { + UnionElems.insert(V1); + } + if (V2Elems != ShadowElements.end()) { + UnionElems.insert(V2Elems->second.begin(), V2Elems->second.end()); } else { - assert(0 && "todo"); - return 0; + UnionElems.insert(V2); } + ShadowElements[Phi] = std::move(UnionElems); + + return Phi; } // A convenience function which folds the shadows of each of the operands @@ -852,7 +965,7 @@ Value *DFSanFunction::combineOperandShadows(Instruction *Inst) { Value *Shadow = getShadow(Inst->getOperand(0)); for (unsigned i = 1, n = Inst->getNumOperands(); i != n; ++i) { - Shadow = DFS.combineShadows(Shadow, getShadow(Inst->getOperand(i)), Inst); + Shadow = combineShadows(Shadow, getShadow(Inst->getOperand(i)), Inst); } return Shadow; } @@ -905,9 +1018,8 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align, IRBuilder<> IRB(Pos); Value *ShadowAddr1 = IRB.CreateGEP(ShadowAddr, ConstantInt::get(DFS.IntptrTy, 1)); - 
return DFS.combineShadows(IRB.CreateAlignedLoad(ShadowAddr, ShadowAlign), - IRB.CreateAlignedLoad(ShadowAddr1, ShadowAlign), - Pos); + return combineShadows(IRB.CreateAlignedLoad(ShadowAddr, ShadowAlign), + IRB.CreateAlignedLoad(ShadowAddr1, ShadowAlign), Pos); } } if (Size % (64 / DFS.ShadowWidth) == 0) { @@ -934,16 +1046,27 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align, BasicBlock *Head = Pos->getParent(); BasicBlock *Tail = Head->splitBasicBlock(Pos); + + if (DomTreeNode *OldNode = DT.getNode(Head)) { + std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end()); + + DomTreeNode *NewNode = DT.addNewBlock(Tail, Head); + for (auto Child : Children) + DT.changeImmediateDominator(Child, NewNode); + } + // In the following code LastBr will refer to the previous basic block's // conditional branch instruction, whose true successor is fixed up to point // to the next block during the loop below or to the tail after the final // iteration. BranchInst *LastBr = BranchInst::Create(FallbackBB, FallbackBB, ShadowsEq); ReplaceInstWithInst(Head->getTerminator(), LastBr); + DT.addNewBlock(FallbackBB, Head); for (uint64_t Ofs = 64 / DFS.ShadowWidth; Ofs != Size; Ofs += 64 / DFS.ShadowWidth) { BasicBlock *NextBB = BasicBlock::Create(*DFS.Ctx, "", F); + DT.addNewBlock(NextBB, LastBr->getParent()); IRBuilder<> NextIRB(NextBB); WideAddr = NextIRB.CreateGEP(WideAddr, ConstantInt::get(DFS.IntptrTy, 1)); Value *NextWideShadow = NextIRB.CreateAlignedLoad(WideAddr, ShadowAlign); @@ -978,14 +1101,15 @@ void DFSanVisitor::visitLoadInst(LoadInst &LI) { Align = 1; } IRBuilder<> IRB(&LI); - Value *LoadedShadow = - DFSF.loadShadow(LI.getPointerOperand(), Size, Align, &LI); - Value *PtrShadow = DFSF.getShadow(LI.getPointerOperand()); - Value *CombinedShadow = DFSF.DFS.combineShadows(LoadedShadow, PtrShadow, &LI); - if (CombinedShadow != DFSF.DFS.ZeroShadow) - DFSF.NonZeroChecks.insert(CombinedShadow); - - DFSF.setShadow(&LI, CombinedShadow); + Value *Shadow = DFSF.loadShadow(LI.getPointerOperand(), Size, Align, &LI); + if (ClCombinePointerLabelsOnLoad) { + Value *PtrShadow = DFSF.getShadow(LI.getPointerOperand()); + Shadow = DFSF.combineShadows(Shadow, PtrShadow, &LI); + } + if (Shadow != DFSF.DFS.ZeroShadow) + DFSF.NonZeroChecks.insert(Shadow); + + DFSF.setShadow(&LI, Shadow); } void DFSanFunction::storeShadow(Value *Addr, uint64_t Size, uint64_t Align, @@ -1050,8 +1174,13 @@ void DFSanVisitor::visitStoreInst(StoreInst &SI) { } else { Align = 1; } - DFSF.storeShadow(SI.getPointerOperand(), Size, Align, - DFSF.getShadow(SI.getValueOperand()), &SI); + + Value* Shadow = DFSF.getShadow(SI.getValueOperand()); + if (ClCombinePointerLabelsOnStore) { + Value *PtrShadow = DFSF.getShadow(SI.getPointerOperand()); + Shadow = DFSF.combineShadows(Shadow, PtrShadow, &SI); + } + DFSF.storeShadow(SI.getPointerOperand(), Size, Align, Shadow, &SI); } void DFSanVisitor::visitBinaryOperator(BinaryOperator &BO) { @@ -1088,12 +1217,11 @@ void DFSanVisitor::visitInsertValueInst(InsertValueInst &I) { void DFSanVisitor::visitAllocaInst(AllocaInst &I) { bool AllLoadsStores = true; - for (Instruction::use_iterator i = I.use_begin(), e = I.use_end(); i != e; - ++i) { - if (isa<LoadInst>(*i)) + for (User *U : I.users()) { + if (isa<LoadInst>(U)) continue; - if (StoreInst *SI = dyn_cast<StoreInst>(*i)) { + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { if (SI->getPointerOperand() == &I) continue; } @@ -1115,9 +1243,9 @@ void DFSanVisitor::visitSelectInst(SelectInst &I) { if 
(isa<VectorType>(I.getCondition()->getType())) { DFSF.setShadow( - &I, DFSF.DFS.combineShadows( - CondShadow, - DFSF.DFS.combineShadows(TrueShadow, FalseShadow, &I), &I)); + &I, + DFSF.combineShadows( + CondShadow, DFSF.combineShadows(TrueShadow, FalseShadow, &I), &I)); } else { Value *ShadowSel; if (TrueShadow == FalseShadow) { @@ -1126,7 +1254,7 @@ void DFSanVisitor::visitSelectInst(SelectInst &I) { ShadowSel = SelectInst::Create(I.getCondition(), TrueShadow, FalseShadow, "", &I); } - DFSF.setShadow(&I, DFSF.DFS.combineShadows(CondShadow, ShadowSel, &I)); + DFSF.setShadow(&I, DFSF.combineShadows(CondShadow, ShadowSel, &I)); } } @@ -1299,7 +1427,7 @@ void DFSanVisitor::visitCallSite(CallSite CS) { } } - Instruction *Next = 0; + Instruction *Next = nullptr; if (!CS.getType()->isVoidTy()) { if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) { if (II->getNormalDest()->getSinglePredecessor()) { diff --git a/contrib/llvm/lib/Transforms/Instrumentation/DebugIR.cpp b/contrib/llvm/lib/Transforms/Instrumentation/DebugIR.cpp index f50a044..f2f1738 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/DebugIR.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/DebugIR.cpp @@ -16,27 +16,23 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "debug-ir" - -#include "llvm/ADT/ValueMap.h" -#include "llvm/Assembly/AssemblyAnnotationWriter.h" -#include "llvm/DebugInfo.h" -#include "llvm/DIBuilder.h" -#include "llvm/InstVisitor.h" +#include "llvm/IR/ValueMap.h" +#include "DebugIR.h" +#include "llvm/IR/AssemblyAnnotationWriter.h" +#include "llvm/IR/DIBuilder.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" -#include "llvm/Transforms/Instrumentation.h" -#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ToolOutputFile.h" -#include "llvm/Support/FormattedStream.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/FormattedStream.h" #include "llvm/Support/Path.h" - -#include "DebugIR.h" - +#include "llvm/Support/ToolOutputFile.h" +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Utils/Cloning.h" #include <string> #define STR_HELPER(x) #x @@ -44,6 +40,8 @@ using namespace llvm; +#define DEBUG_TYPE "debug-ir" + namespace { /// Builds a map of Value* to line numbers on which the Value appears in a @@ -69,11 +67,12 @@ public: // This function is called after an Instruction, GlobalValue, or GlobalAlias // is printed. 
- void printInfoComment(const Value &V, formatted_raw_ostream &Out) { + void printInfoComment(const Value &V, formatted_raw_ostream &Out) override { addEntry(&V, Out); } - void emitFunctionAnnot(const Function *F, formatted_raw_ostream &Out) { + void emitFunctionAnnot(const Function *F, + formatted_raw_ostream &Out) override { addEntry(F, Out); } @@ -119,7 +118,7 @@ public: void visitInstruction(Instruction &I) { if (I.getMetadata(LLVMContext::MD_dbg)) - I.setMetadata(LLVMContext::MD_dbg, 0); + I.setMetadata(LLVMContext::MD_dbg, nullptr); } void run(Module *M) { @@ -169,11 +168,11 @@ class DIUpdater : public InstVisitor<DIUpdater> { public: DIUpdater(Module &M, StringRef Filename = StringRef(), - StringRef Directory = StringRef(), const Module *DisplayM = 0, - const ValueToValueMapTy *VMap = 0) + StringRef Directory = StringRef(), const Module *DisplayM = nullptr, + const ValueToValueMapTy *VMap = nullptr) : Builder(M), Layout(&M), LineTable(DisplayM ? DisplayM : &M), VMap(VMap), - Finder(), Filename(Filename), Directory(Directory), FileNode(0), - LexicalBlockFileNode(0), CUNode(0) { + Finder(), Filename(Filename), Directory(Directory), FileNode(nullptr), + LexicalBlockFileNode(nullptr), CUNode(nullptr) { Finder.processModule(M); visit(&M); } @@ -184,8 +183,8 @@ public: if (Finder.compile_unit_count() > 1) report_fatal_error("DebugIR pass supports only a single compile unit per " "Module."); - createCompileUnit( - Finder.compile_unit_count() == 1 ? *Finder.compile_unit_begin() : 0); + createCompileUnit(Finder.compile_unit_count() == 1 ? + (MDNode*)*Finder.compile_units().begin() : nullptr); } void visitFunction(Function &F) { @@ -233,7 +232,7 @@ public: /// If a ValueToValueMap is provided, use it to get the real instruction as /// the line table was generated on a clone of the module on which we are /// operating. - Value *RealInst = 0; + Value *RealInst = nullptr; if (VMap) RealInst = VMap->lookup(&I); @@ -257,7 +256,7 @@ public: NewLoc = DebugLoc::get(Line, Col, Loc.getScope(RealInst->getContext()), Loc.getInlinedAt(RealInst->getContext())); else if (MDNode *scope = findScope(&I)) - NewLoc = DebugLoc::get(Line, Col, scope, 0); + NewLoc = DebugLoc::get(Line, Col, scope, nullptr); else { DEBUG(dbgs() << "WARNING: no valid scope for instruction " << &I << ". no DebugLoc will be present." @@ -326,19 +325,16 @@ private: << " subprogram nodes" << "\n"); - for (DebugInfoFinder::iterator i = Finder.subprogram_begin(), - e = Finder.subprogram_end(); - i != e; ++i) { - DISubprogram S(*i); + for (DISubprogram S : Finder.subprograms()) { if (S.getFunction() == F) { - DEBUG(dbgs() << "Found DISubprogram " << *i << " for function " + DEBUG(dbgs() << "Found DISubprogram " << S << " for function " << S.getFunction() << "\n"); - return *i; + return S; } } DEBUG(dbgs() << "unable to find DISubprogram node for function " << F->getName().str() << "\n"); - return 0; + return nullptr; } /// Sets Line to the line number on which V appears and returns true. If a /// @@ -358,7 +354,10 @@ std::string getTypeName(Type *T) { std::string TypeName; raw_string_ostream TypeStream(TypeName); - T->print(TypeStream); + if (T) + T->print(TypeStream); + else + TypeStream << "Printing <null> Type"; TypeStream.flush(); return TypeName; } @@ -370,7 +369,7 @@ private: TypeNodeIter i = TypeDescriptors.find(T); if (i != TypeDescriptors.end()) return i->second; - return 0; + return nullptr; } /// Returns a DebugInfo type from an LLVM type T.
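The getType lookup above and the getOrCreateType hunk that follows form a standard memoized-builder pair: a query that may return nullptr, plus a builder that inserts its node into the cache before recursing into element types, so that structs containing pointers to themselves terminate. A minimal standalone model of the pattern (all names here are illustrative, not DebugIR's API):

#include <string>
#include <unordered_map>
#include <vector>

struct TypeDesc {
  std::string Name;
  std::vector<const TypeDesc *> Members;  // filled after the placeholder
};

class TypeDescriptorCache {
  // Keyed by the IR type's identity, like DIUpdater's TypeDescriptors map.
  std::unordered_map<const void *, TypeDesc> Cache;

public:
  // Mirrors getType(): cached descriptor, or nullptr on a miss.
  const TypeDesc *getType(const void *T) const {
    auto It = Cache.find(T);
    return It == Cache.end() ? nullptr : &It->second;
  }

  // Mirrors getOrCreateType(): the node goes into the map *before* its
  // members are visited, so a self-referential type finds the placeholder
  // instead of recursing forever. unordered_map references stay valid
  // across later insertions, which is what makes this safe.
  const TypeDesc *getOrCreateType(const void *T, const std::string &Name) {
    if (const TypeDesc *Hit = getType(T))
      return Hit;
    TypeDesc &Node = Cache[T];            // placeholder is visible from here
    Node.Name = Name;
    // ... recurse into member types and push into Node.Members here ...
    return &Node;
  }
};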
@@ -379,12 +378,12 @@ private: if (N) return DIDerivedType(N); else if (T->isVoidTy()) - return DIDerivedType(0); + return DIDerivedType(nullptr); else if (T->isStructTy()) { N = Builder.createStructType( DIScope(LexicalBlockFileNode), T->getStructName(), DIFile(FileNode), 0, Layout.getTypeSizeInBits(T), Layout.getABITypeAlignment(T), 0, - DIType(0), DIArray(0)); // filled in later + DIType(nullptr), DIArray(nullptr)); // filled in later // N is added to the map (early) so that element search below can find it, // so as to avoid infinite recursion for structs that contain pointers to @@ -504,7 +503,7 @@ bool DebugIR::updateExtension(StringRef NewExtension) { return true; } -void DebugIR::generateFilename(OwningPtr<int> &fd) { +void DebugIR::generateFilename(std::unique_ptr<int> &fd) { SmallVector<char, 16> PathVec; fd.reset(new int); sys::fs::createTemporaryFile("debug-ir", "ll", *fd, PathVec); @@ -525,12 +524,12 @@ std::string DebugIR::getPath() { } void DebugIR::writeDebugBitcode(const Module *M, int *fd) { - OwningPtr<raw_fd_ostream> Out; + std::unique_ptr<raw_fd_ostream> Out; std::string error; if (!fd) { std::string Path = getPath(); - Out.reset(new raw_fd_ostream(Path.c_str(), error)); + Out.reset(new raw_fd_ostream(Path.c_str(), error, sys::fs::F_Text)); DEBUG(dbgs() << "WRITING debug bitcode from Module " << M << " to file " << Path << "\n"); } else { @@ -539,16 +538,16 @@ void DebugIR::writeDebugBitcode(const Module *M, int *fd) { Out.reset(new raw_fd_ostream(*fd, true)); } - M->print(*Out, 0); + M->print(*Out, nullptr); Out->close(); } -void DebugIR::createDebugInfo(Module &M, OwningPtr<Module> &DisplayM) { +void DebugIR::createDebugInfo(Module &M, std::unique_ptr<Module> &DisplayM) { if (M.getFunctionList().size() == 0) // no functions -- no debug info needed return; - OwningPtr<ValueToValueMapTy> VMap; + std::unique_ptr<ValueToValueMapTy> VMap; if (WriteSourceToDisk && (HideDebugIntrinsics || HideDebugMetadata)) { VMap.reset(new ValueToValueMapTy); @@ -567,7 +566,7 @@ void DebugIR::createDebugInfo(Module &M, OwningPtr<Module> &DisplayM) { bool DebugIR::isMissingPath() { return Filename.empty() || Directory.empty(); } bool DebugIR::runOnModule(Module &M) { - OwningPtr<int> fd; + std::unique_ptr<int> fd; if (isMissingPath() && !getSourceInfo(M)) { if (!WriteSourceToDisk) @@ -586,7 +585,7 @@ bool DebugIR::runOnModule(Module &M) { // file name from the DICompileUnit descriptor. DebugMetadataRemover::process(M, !ParsedPath); - OwningPtr<Module> DisplayM; + std::unique_ptr<Module> DisplayM; createDebugInfo(M, DisplayM); if (WriteSourceToDisk) { Module *OutputM = DisplayM.get() ? DisplayM.get() : &M; diff --git a/contrib/llvm/lib/Transforms/Instrumentation/DebugIR.h b/contrib/llvm/lib/Transforms/Instrumentation/DebugIR.h index 13774cf..02831ed 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/DebugIR.h +++ b/contrib/llvm/lib/Transforms/Instrumentation/DebugIR.h @@ -16,7 +16,6 @@ #ifndef LLVM_TRANSFORMS_INSTRUMENTATION_DEBUGIR_H #define LLVM_TRANSFORMS_INSTRUMENTATION_DEBUGIR_H -#include "llvm/ADT/OwningPtr.h" #include "llvm/Pass.h" namespace llvm { @@ -43,7 +42,7 @@ class DebugIR : public llvm::ModulePass { public: static char ID; - const char *getPassName() const { return "DebugIR"; } + const char *getPassName() const override { return "DebugIR"; } /// Generate a file on disk to be displayed in a debugger. If Filename and /// Directory are empty, a temporary path will be generated. 
@@ -62,7 +61,7 @@ public: /// Run pass on M and set Path to the source file path in the output module. bool runOnModule(llvm::Module &M, std::string &Path); - bool runOnModule(llvm::Module &M); + bool runOnModule(llvm::Module &M) override; private: @@ -79,11 +78,11 @@ private: bool updateExtension(llvm::StringRef NewExtension); /// Generate a temporary filename and open an fd - void generateFilename(llvm::OwningPtr<int> &fd); + void generateFilename(std::unique_ptr<int> &fd); /// Creates DWARF CU/Subroutine metadata void createDebugInfo(llvm::Module &M, - llvm::OwningPtr<llvm::Module> &DisplayM); + std::unique_ptr<llvm::Module> &DisplayM); /// Returns true if either Directory or Filename is missing, false otherwise. bool isMissingPath(); @@ -91,7 +90,7 @@ private: /// Write M to disk, optionally passing in an fd to an open file which is /// closed by this function after writing. If no fd is specified, a new file /// is opened, written, and closed. - void writeDebugBitcode(const llvm::Module *M, int *fd = 0); + void writeDebugBitcode(const llvm::Module *M, int *fd = nullptr); }; } // llvm namespace diff --git a/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 206bffb..cfeb62e 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -14,33 +14,36 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "insert-gcov-profiling" - #include "llvm/Transforms/Instrumentation.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Hashing.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/UniqueVector.h" -#include "llvm/DebugInfo.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/DebugLoc.h" #include "llvm/Support/FileSystem.h" -#include "llvm/Support/InstIterator.h" #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include <algorithm> +#include <memory> #include <string> #include <utility> using namespace llvm; +#define DEBUG_TYPE "insert-gcov-profiling" + static cl::opt<std::string> DefaultGCOVVersion("default-gcov-version", cl::init("402*"), cl::Hidden, cl::ValueRequired); @@ -62,20 +65,25 @@ GCOVOptions GCOVOptions::getDefault() { } namespace { + class GCOVFunction; + class GCOVProfiler : public ModulePass { public: static char ID; GCOVProfiler() : ModulePass(ID), Options(GCOVOptions::getDefault()) { - ReversedVersion[0] = Options.Version[3]; - ReversedVersion[1] = Options.Version[2]; - ReversedVersion[2] = Options.Version[1]; - ReversedVersion[3] = Options.Version[0]; - ReversedVersion[4] = '\0'; - initializeGCOVProfilerPass(*PassRegistry::getPassRegistry()); + init(); } GCOVProfiler(const GCOVOptions &Options) : ModulePass(ID), Options(Options){ assert((Options.EmitNotes || Options.EmitData) && "GCOVProfiler asked to do nothing?"); + init(); + } + const char *getPassName() const override { + return "GCOV Profiler"; + } + + private: + void init() { ReversedVersion[0] = Options.Version[3]; ReversedVersion[1] = 
Options.Version[2]; ReversedVersion[2] = Options.Version[1]; @@ -83,12 +91,7 @@ namespace { ReversedVersion[4] = '\0'; initializeGCOVProfilerPass(*PassRegistry::getPassRegistry()); } - virtual const char *getPassName() const { - return "GCOV Profiler"; - } - - private: - bool runOnModule(Module &M); + bool runOnModule(Module &M) override; // Create the .gcno files for the Module based on DebugInfo. void emitProfileNotes(); @@ -130,10 +133,13 @@ namespace { GCOVOptions Options; // Reversed, NUL-terminated copy of Options.Version. - char ReversedVersion[5]; + char ReversedVersion[5]; + // Checksum, produced by hash of EdgeDestinations + SmallVector<uint32_t, 4> FileChecksums; Module *M; LLVMContext *Ctx; + SmallVector<std::unique_ptr<GCOVFunction>, 16> Funcs; }; } @@ -145,7 +151,7 @@ ModulePass *llvm::createGCOVProfilerPass(const GCOVOptions &Options) { return new GCOVProfiler(Options); } -static std::string getFunctionName(DISubprogram SP) { +static StringRef getFunctionName(DISubprogram SP) { if (!SP.getLinkageName().empty()) return SP.getLinkageName(); return SP.getName(); @@ -205,6 +211,7 @@ namespace { class GCOVLines : public GCOVRecord { public: void addLine(uint32_t Line) { + assert(Line != 0 && "Line zero is not a valid real line number."); Lines.push_back(Line); } @@ -220,7 +227,7 @@ namespace { write(Lines[i]); } - GCOVLines(StringRef F, raw_ostream *os) + GCOVLines(StringRef F, raw_ostream *os) : Filename(F) { this->os = os; } @@ -231,14 +238,6 @@ namespace { }; - // Sorting function for deterministic behaviour in GCOVBlock::writeOut. - struct StringKeySort { - bool operator()(StringMapEntry<GCOVLines *> *LHS, - StringMapEntry<GCOVLines *> *RHS) const { - return LHS->getKey() < RHS->getKey(); - } - }; - // Represent a basic block in GCOV. Each block has a unique number in the // function, number of lines belonging to each block, and a set of edges to // other blocks. 
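The init() code just above copies Options.Version into ReversedVersion back to front. The reason is the .gcno encoding: magic and version are stored as single 32-bit words, so on a little-endian host the four characters must be written in reverse for the word to read back in the expected order; the "oncg" magic (which decodes to "gcno") and the "MVLL" stamp ("LLVM" reversed) that appear further down in this diff use the same trick. A small self-contained illustration, assuming a little-endian host:

#include <cstdint>
#include <cstring>
#include <iostream>

int main() {
  const char Version[5] = "402*";        // DefaultGCOVVersion above
  const char Reversed[5] = {Version[3], Version[2], Version[1], Version[0],
                            '\0'};
  std::cout << Reversed << "\n";         // "*204", the bytes written to disk

  // Reading those four bytes back as one little-endian 32-bit word
  // restores the original character order.
  uint32_t Word;
  std::memcpy(&Word, Reversed, 4);
  const char Decoded[5] = {char(Word >> 24), char(Word >> 16), char(Word >> 8),
                           char(Word), '\0'};
  std::cout << Decoded << "\n";          // "402*"
  return 0;
}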
@@ -269,11 +268,14 @@ namespace { write(Len); write(Number); - StringKeySort Sorter; - std::sort(SortedLinesByFile.begin(), SortedLinesByFile.end(), Sorter); + std::sort(SortedLinesByFile.begin(), SortedLinesByFile.end(), + [](StringMapEntry<GCOVLines *> *LHS, + StringMapEntry<GCOVLines *> *RHS) { + return LHS->getKey() < RHS->getKey(); + }); for (SmallVectorImpl<StringMapEntry<GCOVLines *> *>::iterator I = SortedLinesByFile.begin(), E = SortedLinesByFile.end(); - I != E; ++I) + I != E; ++I) (*I)->getValue()->writeOut(); write(0); write(0); @@ -302,30 +304,23 @@ namespace { class GCOVFunction : public GCOVRecord { public: GCOVFunction(DISubprogram SP, raw_ostream *os, uint32_t Ident, - bool UseCfgChecksum) { + bool UseCfgChecksum) : + SP(SP), Ident(Ident), UseCfgChecksum(UseCfgChecksum), CfgChecksum(0) { this->os = os; Function *F = SP.getFunction(); - DEBUG(dbgs() << "Function: " << F->getName() << "\n"); + DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n"); uint32_t i = 0; for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { Blocks[BB] = new GCOVBlock(i++, os); } ReturnBlock = new GCOVBlock(i++, os); - writeBytes(FunctionTag, 4); - uint32_t BlockLen = 1 + 1 + 1 + lengthOfGCOVString(getFunctionName(SP)) + - 1 + lengthOfGCOVString(SP.getFilename()) + 1; - if (UseCfgChecksum) - ++BlockLen; - write(BlockLen); - write(Ident); - write(0); // lineno checksum - if (UseCfgChecksum) - write(0); // cfg checksum - writeGCOVString(getFunctionName(SP)); - writeGCOVString(SP.getFilename()); - write(SP.getLineNumber()); + std::string FunctionNameAndLine; + raw_string_ostream FNLOS(FunctionNameAndLine); + FNLOS << getFunctionName(SP) << SP.getLineNumber(); + FNLOS.flush(); + FuncChecksum = hash_value(FunctionNameAndLine); } ~GCOVFunction() { @@ -341,7 +336,41 @@ namespace { return *ReturnBlock; } + std::string getEdgeDestinations() { + std::string EdgeDestinations; + raw_string_ostream EDOS(EdgeDestinations); + Function *F = Blocks.begin()->first->getParent(); + for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { + GCOVBlock &Block = *Blocks[I]; + for (int i = 0, e = Block.OutEdges.size(); i != e; ++i) + EDOS << Block.OutEdges[i]->Number; + } + return EdgeDestinations; + } + + uint32_t getFuncChecksum() { + return FuncChecksum; + } + + void setCfgChecksum(uint32_t Checksum) { + CfgChecksum = Checksum; + } + void writeOut() { + writeBytes(FunctionTag, 4); + uint32_t BlockLen = 1 + 1 + 1 + lengthOfGCOVString(getFunctionName(SP)) + + 1 + lengthOfGCOVString(SP.getFilename()) + 1; + if (UseCfgChecksum) + ++BlockLen; + write(BlockLen); + write(Ident); + write(FuncChecksum); + if (UseCfgChecksum) + write(CfgChecksum); + writeGCOVString(getFunctionName(SP)); + writeGCOVString(SP.getFilename()); + write(SP.getLineNumber()); + // Emit count of blocks. writeBytes(BlockTag, 4); write(Blocks.size() + 1); @@ -375,6 +404,11 @@ namespace { } private: + DISubprogram SP; + uint32_t Ident; + uint32_t FuncChecksum; + bool UseCfgChecksum; + uint32_t CfgChecksum; DenseMap<BasicBlock *, GCOVBlock *> Blocks; GCOVBlock *ReturnBlock; }; @@ -414,6 +448,28 @@ bool GCOVProfiler::runOnModule(Module &M) { return false; } +static bool functionHasLines(Function *F) { + // Check whether this function actually has any source lines. Not only + // do these waste space, they also can crash gcov. 
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { + for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); + I != IE; ++I) { + // Debug intrinsic locations correspond to the location of the + // declaration, not necessarily any statements or expressions. + if (isa<DbgInfoIntrinsic>(I)) continue; + + const DebugLoc &Loc = I->getDebugLoc(); + if (Loc.isUnknown()) continue; + + // Artificial lines such as calls to the global constructors. + if (Loc.getLine() == 0) continue; + + return true; + } + } + return false; +} + void GCOVProfiler::emitProfileNotes() { NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); if (!CU_Nodes) return; @@ -426,10 +482,8 @@ void GCOVProfiler::emitProfileNotes() { DICompileUnit CU(CU_Nodes->getOperand(i)); std::string ErrorInfo; raw_fd_ostream out(mangleName(CU, "gcno").c_str(), ErrorInfo, - sys::fs::F_Binary); - out.write("oncg", 4); - out.write(ReversedVersion, 4); - out.write("MVLL", 4); + sys::fs::F_None); + std::string EdgeDestinations; DIArray SPs = CU.getSubprograms(); for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) { @@ -441,7 +495,19 @@ void GCOVProfiler::emitProfileNotes() { Function *F = SP.getFunction(); if (!F) continue; - GCOVFunction Func(SP, &out, i, Options.UseCfgChecksum); + if (!functionHasLines(F)) continue; + + // gcov expects every function to start with an entry block that has a + // single successor, so split the entry block to make sure of that. + BasicBlock &EntryBlock = F->getEntryBlock(); + BasicBlock::iterator It = EntryBlock.begin(); + while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It)) + ++It; + EntryBlock.splitBasicBlock(It); + + Funcs.push_back( + make_unique<GCOVFunction>(SP, &out, i, Options.UseCfgChecksum)); + GCOVFunction &Func = *Funcs.back(); for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { GCOVBlock &Block = Func.getBlock(BB); @@ -457,8 +523,16 @@ void GCOVProfiler::emitProfileNotes() { uint32_t Line = 0; for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); I != IE; ++I) { + // Debug intrinsic locations correspond to the location of the + // declaration, not necessarily any statements or expressions. + if (isa<DbgInfoIntrinsic>(I)) continue; + const DebugLoc &Loc = I->getDebugLoc(); if (Loc.isUnknown()) continue; + + // Artificial lines such as calls to the global constructors. 
+ if (Loc.getLine() == 0) continue; + if (Line == Loc.getLine()) continue; Line = Loc.getLine(); if (SP != getDISubprogram(Loc.getScope(*Ctx))) continue; @@ -467,8 +541,19 @@ void GCOVProfiler::emitProfileNotes() { Lines.addLine(Loc.getLine()); } } - Func.writeOut(); + EdgeDestinations += Func.getEdgeDestinations(); } + + FileChecksums.push_back(hash_value(EdgeDestinations)); + out.write("oncg", 4); + out.write(ReversedVersion, 4); + out.write(reinterpret_cast<char*>(&FileChecksums.back()), 4); + + for (auto &Func : Funcs) { + Func->setCfgChecksum(FileChecksums.back()); + Func->writeOut(); + } + out.write("\0\0\0\0\0\0\0\0", 8); // EOF out.close(); } @@ -478,7 +563,7 @@ bool GCOVProfiler::emitProfileArcs() { NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); if (!CU_Nodes) return false; - bool Result = false; + bool Result = false; bool InsertIndCounterIncrCode = false; for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) { DICompileUnit CU(CU_Nodes->getOperand(i)); @@ -492,6 +577,7 @@ bool GCOVProfiler::emitProfileArcs() { continue; Function *F = SP.getFunction(); if (!F) continue; + if (!functionHasLines(F)) continue; if (!Result) Result = true; unsigned Edges = 0; for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { @@ -501,7 +587,7 @@ bool GCOVProfiler::emitProfileArcs() { else Edges += TI->getNumSuccessors(); } - + ArrayType *CounterTy = ArrayType::get(Type::getInt64Ty(*Ctx), Edges); GlobalVariable *Counters = @@ -510,10 +596,10 @@ bool GCOVProfiler::emitProfileArcs() { Constant::getNullValue(CounterTy), "__llvm_gcov_ctr"); CountersBySP.push_back(std::make_pair(Counters, (MDNode*)SP)); - + UniqueVector<BasicBlock *> ComplexEdgePreds; UniqueVector<BasicBlock *> ComplexEdgeSuccs; - + unsigned Edge = 0; for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { TerminatorInst *TI = BB->getTerminator(); @@ -547,13 +633,13 @@ bool GCOVProfiler::emitProfileArcs() { Edge += Successors; } } - + if (!ComplexEdgePreds.empty()) { GlobalVariable *EdgeTable = buildEdgeLookupTable(F, Counters, ComplexEdgePreds, ComplexEdgeSuccs); GlobalVariable *EdgeState = getEdgeStateValue(); - + for (int i = 0, e = ComplexEdgePreds.size(); i != e; ++i) { IRBuilder<> Builder(ComplexEdgePreds[i + 1]->getFirstInsertionPt()); Builder.CreateStore(Builder.getInt32(i), EdgeState); @@ -630,7 +716,7 @@ GlobalVariable *GCOVProfiler::buildEdgeLookupTable( Type *Int64PtrTy = Type::getInt64PtrTy(*Ctx); ArrayType *EdgeTableTy = ArrayType::get(Int64PtrTy, TableSize); - OwningArrayPtr<Constant *> EdgeTable(new Constant*[TableSize]); + std::unique_ptr<Constant * []> EdgeTable(new Constant *[TableSize]); Constant *NullValue = Constant::getNullValue(Int64PtrTy); for (size_t i = 0; i != TableSize; ++i) EdgeTable[i] = NullValue; @@ -666,6 +752,7 @@ Constant *GCOVProfiler::getStartFileFunc() { Type *Args[] = { Type::getInt8PtrTy(*Ctx), // const char *orig_filename Type::getInt8PtrTy(*Ctx), // const char version[4] + Type::getInt32Ty(*Ctx), // uint32_t checksum }; FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false); return M->getOrInsertFunction("llvm_gcda_start_file", FTy); @@ -683,10 +770,12 @@ Constant *GCOVProfiler::getIncrementIndirectCounterFunc() { } Constant *GCOVProfiler::getEmitFunctionFunc() { - Type *Args[3] = { + Type *Args[] = { Type::getInt32Ty(*Ctx), // uint32_t ident Type::getInt8PtrTy(*Ctx), // const char *function_name + Type::getInt32Ty(*Ctx), // uint32_t func_checksum Type::getInt8Ty(*Ctx), // uint8_t use_extra_checksum + 
Type::getInt32Ty(*Ctx), // uint32_t cfg_checksum }; FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false); return M->getOrInsertFunction("llvm_gcda_emit_function", FTy); @@ -760,17 +849,22 @@ Function *GCOVProfiler::insertCounterWriteout( for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) { DICompileUnit CU(CU_Nodes->getOperand(i)); std::string FilenameGcda = mangleName(CU, "gcda"); - Builder.CreateCall2(StartFile, + uint32_t CfgChecksum = FileChecksums.empty() ? 0 : FileChecksums[i]; + Builder.CreateCall3(StartFile, Builder.CreateGlobalStringPtr(FilenameGcda), - Builder.CreateGlobalStringPtr(ReversedVersion)); + Builder.CreateGlobalStringPtr(ReversedVersion), + Builder.getInt32(CfgChecksum)); for (unsigned j = 0, e = CountersBySP.size(); j != e; ++j) { DISubprogram SP(CountersBySP[j].second); - Builder.CreateCall3( + uint32_t FuncChecksum = Funcs.empty() ? 0 : Funcs[j]->getFuncChecksum(); + Builder.CreateCall5( EmitFunction, Builder.getInt32(j), Options.FunctionNamesInData ? Builder.CreateGlobalStringPtr(getFunctionName(SP)) : Constant::getNullValue(Builder.getInt8PtrTy()), - Builder.getInt8(Options.UseCfgChecksum)); + Builder.getInt32(FuncChecksum), + Builder.getInt8(Options.UseCfgChecksum), + Builder.getInt32(CfgChecksum)); GlobalVariable *GV = CountersBySP[j].first; unsigned Arcs = @@ -818,7 +912,7 @@ void GCOVProfiler::insertIndirectCounterIncrement() { // uint64_t *counter = counters[pred]; // if (!counter) return; Value *ZExtPred = Builder.CreateZExt(Pred, Builder.getInt64Ty()); - Arg = llvm::next(Fn->arg_begin()); + Arg = std::next(Fn->arg_begin()); Arg->setName("counters"); Value *GEP = Builder.CreateGEP(Arg, ZExtPred); Value *Counter = Builder.CreateLoad(GEP, "counter"); diff --git a/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp index b1bea38..ac1dd43 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -13,8 +13,8 @@ //===----------------------------------------------------------------------===// #include "llvm/InitializePasses.h" -#include "llvm/PassRegistry.h" #include "llvm-c/Initialization.h" +#include "llvm/PassRegistry.h" using namespace llvm; diff --git a/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index d547adc..57e308c 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -10,8 +10,6 @@ /// This file is a part of MemorySanitizer, a detector of uninitialized /// reads. /// -/// Status: early prototype. -/// /// The algorithm of the tool is similar to Memcheck /// (http://goo.gl/QKbem). 
We associate a few shadow bits with every /// byte of the application memory, poison the shadow of the malloc-ed @@ -93,24 +91,23 @@ //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "msan" - #include "llvm/Transforms/Instrumentation.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" -#include "llvm/ADT/ValueMap.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstVisitor.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" -#include "llvm/InstVisitor.h" +#include "llvm/IR/ValueMap.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" @@ -118,10 +115,11 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ModuleUtils.h" -#include "llvm/Transforms/Utils/SpecialCaseList.h" using namespace llvm; +#define DEBUG_TYPE "msan" + static const uint64_t kShadowMask32 = 1ULL << 31; static const uint64_t kShadowMask64 = 1ULL << 46; static const uint64_t kOriginOffset32 = 1ULL << 30; @@ -129,13 +127,16 @@ static const uint64_t kOriginOffset64 = 1ULL << 45; static const unsigned kMinOriginAlignment = 4; static const unsigned kShadowTLSAlignment = 8; +// Access sizes are powers of two: 1, 2, 4, 8. +static const size_t kNumberOfAccessSizes = 4; + /// \brief Track origins of uninitialized values. /// /// Adds a section to MemorySanitizer report that points to the allocation /// (stack or heap) the uninitialized bits came from originally. -static cl::opt<bool> ClTrackOrigins("msan-track-origins", +static cl::opt<int> ClTrackOrigins("msan-track-origins", cl::desc("Track origins (allocation sites) of poisoned memory"), - cl::Hidden, cl::init(false)); + cl::Hidden, cl::init(0)); static cl::opt<bool> ClKeepGoing("msan-keep-going", cl::desc("keep going after reporting a UMR"), cl::Hidden, cl::init(false)); @@ -160,10 +161,6 @@ static cl::opt<bool> ClHandleICmpExact("msan-handle-icmp-exact", cl::desc("exact handling of relational integer ICmp"), cl::Hidden, cl::init(false)); -static cl::opt<bool> ClStoreCleanOrigin("msan-store-clean-origin", - cl::desc("store origin for clean (fully initialized) values"), - cl::Hidden, cl::init(false)); - // This flag controls whether we check the shadow of the address // operand of load or store. Such bugs are very rare, since load from // a garbage address typically results in SEGV, but still happen @@ -178,9 +175,13 @@ static cl::opt<bool> ClDumpStrictInstructions("msan-dump-strict-instructions", cl::desc("print out instructions with default strict semantics"), cl::Hidden, cl::init(false)); -static cl::opt<std::string> ClBlacklistFile("msan-blacklist", - cl::desc("File containing the list of functions where MemorySanitizer " - "should not report bugs"), cl::Hidden); +static cl::opt<int> ClInstrumentationWithCallThreshold( + "msan-instrumentation-with-call-threshold", + cl::desc( + "If the function being instrumented requires more than " + "this number of checks and origin stores, use callbacks instead of " + "inline checks (-1 means never use callbacks)."), + cl::Hidden, cl::init(3500)); // Experimental.
Wraps all indirect calls in the instrumented code with // a call to the given function. This is needed to assist the dynamic @@ -203,26 +204,24 @@ namespace { /// uninitialized reads. class MemorySanitizer : public FunctionPass { public: - MemorySanitizer(bool TrackOrigins = false, - StringRef BlacklistFile = StringRef()) + MemorySanitizer(int TrackOrigins = 0) : FunctionPass(ID), - TrackOrigins(TrackOrigins || ClTrackOrigins), - TD(0), - WarningFn(0), - BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile : BlacklistFile), + TrackOrigins(std::max(TrackOrigins, (int)ClTrackOrigins)), + DL(nullptr), + WarningFn(nullptr), WrapIndirectCalls(!ClWrapIndirectCalls.empty()) {} - const char *getPassName() const { return "MemorySanitizer"; } - bool runOnFunction(Function &F); - bool doInitialization(Module &M); + const char *getPassName() const override { return "MemorySanitizer"; } + bool runOnFunction(Function &F) override; + bool doInitialization(Module &M) override; static char ID; // Pass identification, replacement for typeid. private: void initializeCallbacks(Module &M); /// \brief Track origins (allocation points) of uninitialized values. - bool TrackOrigins; + int TrackOrigins; - DataLayout *TD; + const DataLayout *DL; LLVMContext *C; Type *IntptrTy; Type *OriginTy; @@ -249,13 +248,18 @@ class MemorySanitizer : public FunctionPass { /// \brief The run-time callback to print a warning. Value *WarningFn; - /// \brief Run-time helper that copies origin info for a memory range. - Value *MsanCopyOriginFn; + // These arrays are indexed by log2(AccessSize). + Value *MaybeWarningFn[kNumberOfAccessSizes]; + Value *MaybeStoreOriginFn[kNumberOfAccessSizes]; + /// \brief Run-time helper that generates a new origin value for a stack /// allocation. Value *MsanSetAllocaOrigin4Fn; /// \brief Run-time helper that poisons stack on function entry. Value *MsanPoisonStackFn; + /// \brief Run-time helper that records a store (or any event) of an + /// uninitialized value and returns an updated origin id encoding this info. + Value *MsanChainOriginFn; /// \brief MSan runtime replacements for memmove, memcpy and memset. Value *MemmoveFn, *MemcpyFn, *MemsetFn; @@ -269,10 +273,6 @@ class MemorySanitizer : public FunctionPass { MDNode *ColdCallWeights; /// \brief Branch weights for origin store. MDNode *OriginStoreWeights; - /// \brief Path to blacklist file. - SmallString<64> BlacklistFile; - /// \brief The blacklist. - OwningPtr<SpecialCaseList> BL; /// \brief An empty volatile inline asm that prevents callback merge. InlineAsm *EmptyAsm; @@ -292,9 +292,8 @@ INITIALIZE_PASS(MemorySanitizer, "msan", "MemorySanitizer: detects uninitialized reads.", false, false) -FunctionPass *llvm::createMemorySanitizerPass(bool TrackOrigins, - StringRef BlacklistFile) { - return new MemorySanitizer(TrackOrigins, BlacklistFile); +FunctionPass *llvm::createMemorySanitizerPass(int TrackOrigins) { + return new MemorySanitizer(TrackOrigins); } /// \brief Create a non-const global initialized with the given string. 
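The initializeCallbacks hunk that follows registers one __msan_maybe_warning_N and one __msan_maybe_store_origin_N runtime callback per power-of-two access size, selected later through TypeSizeToSizeIndex. A standalone sketch of that indexing scheme, assuming only the size table from the diff (the loop and printout are illustrative; the pass itself uses Log2_32_Ceil):

    #include <cstdio>
    #include <initializer_list>

    // Mirrors kNumberOfAccessSizes in the pass: sizes 1, 2, 4, 8 bytes.
    static const unsigned kNumberOfAccessSizes = 4;

    // Same mapping as the pass's TypeSizeToSizeIndex: up to 8 bits uses
    // slot 0, otherwise ceil(log2(bytes)) picks the slot.
    unsigned typeSizeToSizeIndex(unsigned TypeSizeInBits) {
      if (TypeSizeInBits <= 8) return 0;
      unsigned Bytes = (TypeSizeInBits + 7) / 8, Index = 0;
      while ((1u << Index) < Bytes) ++Index; // ceil(log2(Bytes))
      return Index;
    }

    int main() {
      // i8/i16/i32/i64 shadows select __msan_maybe_warning_{1,2,4,8}.
      for (unsigned Bits : {8u, 16u, 32u, 64u}) {
        unsigned Index = typeSizeToSizeIndex(Bits);
        if (Index < kNumberOfAccessSizes)
          std::printf("i%u -> __msan_maybe_warning_%u\n", Bits,
                      1u << Index);
      }
      return 0;
    }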
@@ -324,14 +323,27 @@ void MemorySanitizer::initializeCallbacks(Module &M) { : "__msan_warning_noreturn"; WarningFn = M.getOrInsertFunction(WarningFnName, IRB.getVoidTy(), NULL); - MsanCopyOriginFn = M.getOrInsertFunction( - "__msan_copy_origin", IRB.getVoidTy(), IRB.getInt8PtrTy(), - IRB.getInt8PtrTy(), IntptrTy, NULL); + for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes; + AccessSizeIndex++) { + unsigned AccessSize = 1 << AccessSizeIndex; + std::string FunctionName = "__msan_maybe_warning_" + itostr(AccessSize); + MaybeWarningFn[AccessSizeIndex] = M.getOrInsertFunction( + FunctionName, IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8), + IRB.getInt32Ty(), NULL); + + FunctionName = "__msan_maybe_store_origin_" + itostr(AccessSize); + MaybeStoreOriginFn[AccessSizeIndex] = M.getOrInsertFunction( + FunctionName, IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8), + IRB.getInt8PtrTy(), IRB.getInt32Ty(), NULL); + } + MsanSetAllocaOrigin4Fn = M.getOrInsertFunction( "__msan_set_alloca_origin4", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy, IRB.getInt8PtrTy(), IntptrTy, NULL); MsanPoisonStackFn = M.getOrInsertFunction( "__msan_poison_stack", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy, NULL); + MsanChainOriginFn = M.getOrInsertFunction( + "__msan_chain_origin", IRB.getInt32Ty(), IRB.getInt32Ty(), NULL); MemmoveFn = M.getOrInsertFunction( "__msan_memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy, NULL); @@ -345,31 +357,32 @@ void MemorySanitizer::initializeCallbacks(Module &M) { // Create globals. RetvalTLS = new GlobalVariable( M, ArrayType::get(IRB.getInt64Ty(), 8), false, - GlobalVariable::ExternalLinkage, 0, "__msan_retval_tls", 0, + GlobalVariable::ExternalLinkage, nullptr, "__msan_retval_tls", nullptr, GlobalVariable::InitialExecTLSModel); RetvalOriginTLS = new GlobalVariable( - M, OriginTy, false, GlobalVariable::ExternalLinkage, 0, - "__msan_retval_origin_tls", 0, GlobalVariable::InitialExecTLSModel); + M, OriginTy, false, GlobalVariable::ExternalLinkage, nullptr, + "__msan_retval_origin_tls", nullptr, GlobalVariable::InitialExecTLSModel); ParamTLS = new GlobalVariable( M, ArrayType::get(IRB.getInt64Ty(), 1000), false, - GlobalVariable::ExternalLinkage, 0, "__msan_param_tls", 0, + GlobalVariable::ExternalLinkage, nullptr, "__msan_param_tls", nullptr, GlobalVariable::InitialExecTLSModel); ParamOriginTLS = new GlobalVariable( M, ArrayType::get(OriginTy, 1000), false, GlobalVariable::ExternalLinkage, - 0, "__msan_param_origin_tls", 0, GlobalVariable::InitialExecTLSModel); + nullptr, "__msan_param_origin_tls", nullptr, + GlobalVariable::InitialExecTLSModel); VAArgTLS = new GlobalVariable( M, ArrayType::get(IRB.getInt64Ty(), 1000), false, - GlobalVariable::ExternalLinkage, 0, "__msan_va_arg_tls", 0, + GlobalVariable::ExternalLinkage, nullptr, "__msan_va_arg_tls", nullptr, GlobalVariable::InitialExecTLSModel); VAArgOverflowSizeTLS = new GlobalVariable( - M, IRB.getInt64Ty(), false, GlobalVariable::ExternalLinkage, 0, - "__msan_va_arg_overflow_size_tls", 0, + M, IRB.getInt64Ty(), false, GlobalVariable::ExternalLinkage, nullptr, + "__msan_va_arg_overflow_size_tls", nullptr, GlobalVariable::InitialExecTLSModel); OriginTLS = new GlobalVariable( - M, IRB.getInt32Ty(), false, GlobalVariable::ExternalLinkage, 0, - "__msan_origin_tls", 0, GlobalVariable::InitialExecTLSModel); + M, IRB.getInt32Ty(), false, GlobalVariable::ExternalLinkage, nullptr, + "__msan_origin_tls", nullptr, GlobalVariable::InitialExecTLSModel); // We insert an empty inline asm after 
__msan_report* to avoid callback merge. EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), @@ -383,14 +396,14 @@ void MemorySanitizer::initializeCallbacks(Module &M) { ClWrapIndirectCalls, AnyFunctionPtrTy, AnyFunctionPtrTy, NULL); } - if (ClWrapIndirectCallsFast) { + if (WrapIndirectCalls && ClWrapIndirectCallsFast) { MsandrModuleStart = new GlobalVariable( M, IRB.getInt32Ty(), false, GlobalValue::ExternalLinkage, - 0, "__executable_start"); + nullptr, "__executable_start"); MsandrModuleStart->setVisibility(GlobalVariable::HiddenVisibility); MsandrModuleEnd = new GlobalVariable( M, IRB.getInt32Ty(), false, GlobalValue::ExternalLinkage, - 0, "_end"); + nullptr, "_end"); MsandrModuleEnd->setVisibility(GlobalVariable::HiddenVisibility); } } @@ -399,12 +412,13 @@ void MemorySanitizer::initializeCallbacks(Module &M) { /// /// inserts a call to __msan_init to the module's constructor list. bool MemorySanitizer::doInitialization(Module &M) { - TD = getAnalysisIfAvailable<DataLayout>(); - if (!TD) - return false; - BL.reset(SpecialCaseList::createOrDie(BlacklistFile)); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + if (!DLP) + report_fatal_error("data layout missing"); + DL = &DLP->getDataLayout(); + C = &(M.getContext()); - unsigned PtrSize = TD->getPointerSizeInBits(/* AddressSpace */0); + unsigned PtrSize = DL->getPointerSizeInBits(/* AddressSpace */0); switch (PtrSize) { case 64: ShadowMask = kShadowMask64; @@ -420,7 +434,7 @@ bool MemorySanitizer::doInitialization(Module &M) { } IRBuilder<> IRB(*C); - IntptrTy = IRB.getIntPtrTy(TD); + IntptrTy = IRB.getIntPtrTy(DL); OriginTy = IRB.getInt32Ty(); ColdCallWeights = MDBuilder(*C).createBranchWeights(1, 1000); @@ -476,6 +490,11 @@ VarArgHelper* CreateVarArgHelper(Function &Func, MemorySanitizer &Msan, MemorySanitizerVisitor &Visitor); +unsigned TypeSizeToSizeIndex(unsigned TypeSize) { + if (TypeSize <= 8) return 0; + return Log2_32_Ceil(TypeSize / 8); +} + /// This class does all the work for a given function. Store and Load /// instructions store and load corresponding shadow and origin /// values. Most instructions propagate shadow from arguments to their @@ -487,12 +506,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { MemorySanitizer &MS; SmallVector<PHINode *, 16> ShadowPHINodes, OriginPHINodes; ValueMap<Value*, Value*> ShadowMap, OriginMap; - OwningPtr<VarArgHelper> VAHelper; + std::unique_ptr<VarArgHelper> VAHelper; // The following flags disable parts of MSan instrumentation based on // blacklist contents and command-line options. 
bool InsertChecks; - bool LoadShadow; + bool PropagateShadow; bool PoisonStack; bool PoisonUndef; bool CheckReturnValue; @@ -503,7 +522,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Instruction *OrigIns; ShadowOriginAndInsertPoint(Value *S, Value *O, Instruction *I) : Shadow(S), Origin(O), OrigIns(I) { } - ShadowOriginAndInsertPoint() : Shadow(0), Origin(0), OrigIns(0) { } }; SmallVector<ShadowOriginAndInsertPoint, 16> InstrumentationList; SmallVector<Instruction*, 16> StoreList; @@ -511,11 +529,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { MemorySanitizerVisitor(Function &F, MemorySanitizer &MS) : F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)) { - bool SanitizeFunction = !MS.BL->isIn(F) && F.getAttributes().hasAttribute( - AttributeSet::FunctionIndex, - Attribute::SanitizeMemory); + bool SanitizeFunction = F.getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::SanitizeMemory); InsertChecks = SanitizeFunction; - LoadShadow = SanitizeFunction; + PropagateShadow = SanitizeFunction; PoisonStack = SanitizeFunction && ClPoisonStack; PoisonUndef = SanitizeFunction && ClPoisonUndef; // FIXME: Consider using SpecialCaseList to specify a list of functions that @@ -527,89 +544,119 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { << F.getName() << "'\n"); } - void materializeStores() { - for (size_t i = 0, n = StoreList.size(); i < n; i++) { - StoreInst& I = *dyn_cast<StoreInst>(StoreList[i]); + Value *updateOrigin(Value *V, IRBuilder<> &IRB) { + if (MS.TrackOrigins <= 1) return V; + return IRB.CreateCall(MS.MsanChainOriginFn, V); + } - IRBuilder<> IRB(&I); - Value *Val = I.getValueOperand(); - Value *Addr = I.getPointerOperand(); - Value *Shadow = I.isAtomic() ? getCleanShadow(Val) : getShadow(Val); + void storeOrigin(IRBuilder<> &IRB, Value *Addr, Value *Shadow, Value *Origin, + unsigned Alignment, bool AsCall) { + if (isa<StructType>(Shadow->getType())) { + IRB.CreateAlignedStore(updateOrigin(Origin, IRB), getOriginPtr(Addr, IRB), + Alignment); + } else { + Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); + // TODO(eugenis): handle non-zero constant shadow by inserting an + // unconditional check (can not simply fail compilation as this could + // be in the dead code). + if (isa<Constant>(ConvertedShadow)) return; + unsigned TypeSizeInBits = + MS.DL->getTypeSizeInBits(ConvertedShadow->getType()); + unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits); + if (AsCall && SizeIndex < kNumberOfAccessSizes) { + Value *Fn = MS.MaybeStoreOriginFn[SizeIndex]; + Value *ConvertedShadow2 = IRB.CreateZExt( + ConvertedShadow, IRB.getIntNTy(8 * (1 << SizeIndex))); + IRB.CreateCall3(Fn, ConvertedShadow2, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + Origin); + } else { + Value *Cmp = IRB.CreateICmpNE( + ConvertedShadow, getCleanShadow(ConvertedShadow), "_mscmp"); + Instruction *CheckTerm = SplitBlockAndInsertIfThen( + Cmp, IRB.GetInsertPoint(), false, MS.OriginStoreWeights); + IRBuilder<> IRBNew(CheckTerm); + IRBNew.CreateAlignedStore(updateOrigin(Origin, IRBNew), + getOriginPtr(Addr, IRBNew), Alignment); + } + } + } + + void materializeStores(bool InstrumentWithCalls) { + for (auto Inst : StoreList) { + StoreInst &SI = *dyn_cast<StoreInst>(Inst); + + IRBuilder<> IRB(&SI); + Value *Val = SI.getValueOperand(); + Value *Addr = SI.getPointerOperand(); + Value *Shadow = SI.isAtomic() ? 
getCleanShadow(Val) : getShadow(Val); Value *ShadowPtr = getShadowPtr(Addr, Shadow->getType(), IRB); StoreInst *NewSI = - IRB.CreateAlignedStore(Shadow, ShadowPtr, I.getAlignment()); + IRB.CreateAlignedStore(Shadow, ShadowPtr, SI.getAlignment()); DEBUG(dbgs() << " STORE: " << *NewSI << "\n"); (void)NewSI; - if (ClCheckAccessAddress) - insertShadowCheck(Addr, &I); + if (ClCheckAccessAddress) insertShadowCheck(Addr, &SI); - if (I.isAtomic()) - I.setOrdering(addReleaseOrdering(I.getOrdering())); + if (SI.isAtomic()) SI.setOrdering(addReleaseOrdering(SI.getOrdering())); if (MS.TrackOrigins) { - unsigned Alignment = std::max(kMinOriginAlignment, I.getAlignment()); - if (ClStoreCleanOrigin || isa<StructType>(Shadow->getType())) { - IRB.CreateAlignedStore(getOrigin(Val), getOriginPtr(Addr, IRB), - Alignment); - } else { - Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); - - // TODO(eugenis): handle non-zero constant shadow by inserting an - // unconditional check (can not simply fail compilation as this could - // be in the dead code). - if (isa<Constant>(ConvertedShadow)) - continue; - - Value *Cmp = IRB.CreateICmpNE(ConvertedShadow, - getCleanShadow(ConvertedShadow), "_mscmp"); - Instruction *CheckTerm = - SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false, - MS.OriginStoreWeights); - IRBuilder<> IRBNew(CheckTerm); - IRBNew.CreateAlignedStore(getOrigin(Val), getOriginPtr(Addr, IRBNew), - Alignment); - } + unsigned Alignment = std::max(kMinOriginAlignment, SI.getAlignment()); + storeOrigin(IRB, Addr, Shadow, getOrigin(Val), Alignment, + InstrumentWithCalls); } } } - void materializeChecks() { - for (size_t i = 0, n = InstrumentationList.size(); i < n; i++) { - Value *Shadow = InstrumentationList[i].Shadow; - Instruction *OrigIns = InstrumentationList[i].OrigIns; - IRBuilder<> IRB(OrigIns); - DEBUG(dbgs() << " SHAD0 : " << *Shadow << "\n"); - Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); - DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n"); - // See the comment in materializeStores(). - if (isa<Constant>(ConvertedShadow)) - continue; + void materializeOneCheck(Instruction *OrigIns, Value *Shadow, Value *Origin, + bool AsCall) { + IRBuilder<> IRB(OrigIns); + DEBUG(dbgs() << " SHAD0 : " << *Shadow << "\n"); + Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); + DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n"); + // See the comment in materializeStores(). + if (isa<Constant>(ConvertedShadow)) return; + unsigned TypeSizeInBits = + MS.DL->getTypeSizeInBits(ConvertedShadow->getType()); + unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits); + if (AsCall && SizeIndex < kNumberOfAccessSizes) { + Value *Fn = MS.MaybeWarningFn[SizeIndex]; + Value *ConvertedShadow2 = + IRB.CreateZExt(ConvertedShadow, IRB.getIntNTy(8 * (1 << SizeIndex))); + IRB.CreateCall2(Fn, ConvertedShadow2, MS.TrackOrigins && Origin + ? Origin + : (Value *)IRB.getInt32(0)); + } else { Value *Cmp = IRB.CreateICmpNE(ConvertedShadow, getCleanShadow(ConvertedShadow), "_mscmp"); - Instruction *CheckTerm = - SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), - /* Unreachable */ !ClKeepGoing, - MS.ColdCallWeights); + Instruction *CheckTerm = SplitBlockAndInsertIfThen( + Cmp, OrigIns, + /* Unreachable */ !ClKeepGoing, MS.ColdCallWeights); IRB.SetInsertPoint(CheckTerm); if (MS.TrackOrigins) { - Value *Origin = InstrumentationList[i].Origin; - IRB.CreateStore(Origin ? (Value*)Origin : (Value*)IRB.getInt32(0), + IRB.CreateStore(Origin ? 
(Value *)Origin : (Value *)IRB.getInt32(0), MS.OriginTLS); } - CallInst *Call = IRB.CreateCall(MS.WarningFn); - Call->setDebugLoc(OrigIns->getDebugLoc()); + IRB.CreateCall(MS.WarningFn); IRB.CreateCall(MS.EmptyAsm); DEBUG(dbgs() << " CHECK: " << *Cmp << "\n"); } + } + + void materializeChecks(bool InstrumentWithCalls) { + for (const auto &ShadowData : InstrumentationList) { + Instruction *OrigIns = ShadowData.OrigIns; + Value *Shadow = ShadowData.Shadow; + Value *Origin = ShadowData.Origin; + materializeOneCheck(OrigIns, Shadow, Origin, InstrumentWithCalls); + } DEBUG(dbgs() << "DONE:\n" << F); } void materializeIndirectCalls() { - for (size_t i = 0, n = IndirectCallList.size(); i < n; i++) { - CallSite CS = IndirectCallList[i]; + for (auto &CS : IndirectCallList) { Instruction *I = CS.getInstruction(); BasicBlock *B = I->getParent(); IRBuilder<> IRB(I); @@ -629,7 +676,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRB.CreatePHI(Fn0->getType(), 2, "msandr.indirect_target"); Instruction *CheckTerm = SplitBlockAndInsertIfThen( - cast<Instruction>(NotInThisModule), + NotInThisModule, NewFnPhi, /* Unreachable */ false, MS.ColdCallWeights); IRB.SetInsertPoint(CheckTerm); @@ -652,7 +699,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// \brief Add MemorySanitizer instrumentation to a function. bool runOnFunction() { MS.initializeCallbacks(*F.getParent()); - if (!MS.TD) return false; + if (!MS.DL) return false; // In the presence of unreachable blocks, we may see Phi nodes with // incoming nodes from such blocks. Since InstVisitor skips unreachable @@ -663,33 +710,33 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // Iterate all BBs in depth-first order and create shadow instructions // for all instructions (where applicable). // For PHI nodes we create dummy shadow PHIs which will be finalized later. - for (df_iterator<BasicBlock*> DI = df_begin(&F.getEntryBlock()), - DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) { - BasicBlock *BB = *DI; + for (BasicBlock *BB : depth_first(&F.getEntryBlock())) visit(*BB); - } + // Finalize PHI nodes. - for (size_t i = 0, n = ShadowPHINodes.size(); i < n; i++) { - PHINode *PN = ShadowPHINodes[i]; + for (PHINode *PN : ShadowPHINodes) { PHINode *PNS = cast<PHINode>(getShadow(PN)); - PHINode *PNO = MS.TrackOrigins ? cast<PHINode>(getOrigin(PN)) : 0; + PHINode *PNO = MS.TrackOrigins ? cast<PHINode>(getOrigin(PN)) : nullptr; size_t NumValues = PN->getNumIncomingValues(); for (size_t v = 0; v < NumValues; v++) { PNS->addIncoming(getShadow(PN, v), PN->getIncomingBlock(v)); - if (PNO) - PNO->addIncoming(getOrigin(PN, v), PN->getIncomingBlock(v)); + if (PNO) PNO->addIncoming(getOrigin(PN, v), PN->getIncomingBlock(v)); } } VAHelper->finalizeInstrumentation(); + bool InstrumentWithCalls = ClInstrumentationWithCallThreshold >= 0 && + InstrumentationList.size() + StoreList.size() > + (unsigned)ClInstrumentationWithCallThreshold; + // Delayed instrumentation of StoreInst. // This may add new checks to be inserted later. - materializeStores(); + materializeStores(InstrumentWithCalls); // Insert shadow value checks. - materializeChecks(); + materializeChecks(InstrumentWithCalls); // Wrap indirect calls. materializeIndirectCalls(); @@ -705,14 +752,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// \brief Compute the shadow type that corresponds to a given Type. 
Type *getShadowTy(Type *OrigTy) { if (!OrigTy->isSized()) { - return 0; + return nullptr; } // For integer type, shadow is the same as the original type. // This may return weird-sized types like i1. if (IntegerType *IT = dyn_cast<IntegerType>(OrigTy)) return IT; if (VectorType *VT = dyn_cast<VectorType>(OrigTy)) { - uint32_t EltSize = MS.TD->getTypeSizeInBits(VT->getElementType()); + uint32_t EltSize = MS.DL->getTypeSizeInBits(VT->getElementType()); return VectorType::get(IntegerType::get(*MS.C, EltSize), VT->getNumElements()); } @@ -724,7 +771,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { DEBUG(dbgs() << "getShadowTy: " << *ST << " ===> " << *Res << "\n"); return Res; } - uint32_t TypeSize = MS.TD->getTypeSizeInBits(OrigTy); + uint32_t TypeSize = MS.DL->getTypeSizeInBits(OrigTy); return IntegerType::get(*MS.C, TypeSize); } @@ -785,7 +832,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// \brief Compute the origin address for a given function argument. Value *getOriginPtrForArgument(Value *A, IRBuilder<> &IRB, int ArgOffset) { - if (!MS.TrackOrigins) return 0; + if (!MS.TrackOrigins) return nullptr; Value *Base = IRB.CreatePointerCast(MS.ParamOriginTLS, MS.IntptrTy); Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0), @@ -808,7 +855,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// \brief Set SV to be the shadow value for V. void setShadow(Value *V, Value *SV) { assert(!ShadowMap.count(V) && "Values may only have one shadow"); - ShadowMap[V] = SV; + ShadowMap[V] = PropagateShadow ? SV : getCleanShadow(V); } /// \brief Set Origin to be the origin value for V. @@ -826,7 +873,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Constant *getCleanShadow(Value *V) { Type *ShadowTy = getShadowTy(V); if (!ShadowTy) - return 0; + return nullptr; return Constant::getNullValue(ShadowTy); } @@ -846,7 +893,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Constant *getPoisonedShadow(Value *V) { Type *ShadowTy = getShadowTy(V); if (!ShadowTy) - return 0; + return nullptr; return getPoisonedShadow(ShadowTy); } @@ -860,6 +907,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// This function either returns the value set earlier with setShadow, /// or extracts if from ParamTLS (for function arguments). Value *getShadow(Value *V) { + if (!PropagateShadow) return getCleanShadow(V); if (Instruction *I = dyn_cast<Instruction>(V)) { // For instructions the shadow is already stored in the map. Value *Shadow = ShadowMap[V]; @@ -884,25 +932,24 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Function *F = A->getParent(); IRBuilder<> EntryIRB(F->getEntryBlock().getFirstNonPHI()); unsigned ArgOffset = 0; - for (Function::arg_iterator AI = F->arg_begin(), AE = F->arg_end(); - AI != AE; ++AI) { - if (!AI->getType()->isSized()) { + for (auto &FArg : F->args()) { + if (!FArg.getType()->isSized()) { DEBUG(dbgs() << "Arg is not sized\n"); continue; } - unsigned Size = AI->hasByValAttr() - ? MS.TD->getTypeAllocSize(AI->getType()->getPointerElementType()) - : MS.TD->getTypeAllocSize(AI->getType()); - if (A == AI) { - Value *Base = getShadowPtrForArgument(AI, EntryIRB, ArgOffset); - if (AI->hasByValAttr()) { + unsigned Size = FArg.hasByValAttr() + ? 
MS.DL->getTypeAllocSize(FArg.getType()->getPointerElementType()) + : MS.DL->getTypeAllocSize(FArg.getType()); + if (A == &FArg) { + Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset); + if (FArg.hasByValAttr()) { // ByVal pointer itself has clean shadow. We copy the actual // argument shadow to the underlying memory. // Figure out maximal valid memcpy alignment. - unsigned ArgAlign = AI->getParamAlignment(); + unsigned ArgAlign = FArg.getParamAlignment(); if (ArgAlign == 0) { Type *EltType = A->getType()->getPointerElementType(); - ArgAlign = MS.TD->getABITypeAlignment(EltType); + ArgAlign = MS.DL->getABITypeAlignment(EltType); } unsigned CopyAlign = std::min(ArgAlign, kShadowTLSAlignment); Value *Cpy = EntryIRB.CreateMemCpy( @@ -914,10 +961,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { } else { *ShadowPtr = EntryIRB.CreateAlignedLoad(Base, kShadowTLSAlignment); } - DEBUG(dbgs() << " ARG: " << *AI << " ==> " << + DEBUG(dbgs() << " ARG: " << FArg << " ==> " << **ShadowPtr << "\n"); if (MS.TrackOrigins) { - Value* OriginPtr = getOriginPtrForArgument(AI, EntryIRB, ArgOffset); + Value *OriginPtr = + getOriginPtrForArgument(&FArg, EntryIRB, ArgOffset); setOrigin(A, EntryIRB.CreateLoad(OriginPtr)); } } @@ -937,7 +985,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// \brief Get the origin for a value. Value *getOrigin(Value *V) { - if (!MS.TrackOrigins) return 0; + if (!MS.TrackOrigins) return nullptr; if (isa<Instruction>(V) || isa<Argument>(V)) { Value *Origin = OriginMap[V]; if (!Origin) { @@ -1027,7 +1075,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRB(I.getNextNode()); Type *ShadowTy = getShadowTy(&I); Value *Addr = I.getPointerOperand(); - if (LoadShadow) { + if (PropagateShadow) { Value *ShadowPtr = getShadowPtr(Addr, ShadowTy, IRB); setShadow(&I, IRB.CreateAlignedLoad(ShadowPtr, I.getAlignment(), "_msld")); @@ -1042,7 +1090,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { I.setOrdering(addAcquireOrdering(I.getOrdering())); if (MS.TrackOrigins) { - if (LoadShadow) { + if (PropagateShadow) { unsigned Alignment = std::max(kMinOriginAlignment, I.getAlignment()); setOrigin(&I, IRB.CreateAlignedLoad(getOriginPtr(Addr, IRB), Alignment)); @@ -1088,7 +1136,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) { handleCASOrRMW(I); - I.setOrdering(addReleaseOrdering(I.getOrdering())); + I.setSuccessOrdering(addReleaseOrdering(I.getSuccessOrdering())); } // Vector manipulation. @@ -1235,7 +1283,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { public: Combiner(MemorySanitizerVisitor *MSV, IRBuilder<> &IRB) : - Shadow(0), Origin(0), IRB(IRB), MSV(MSV) {} + Shadow(nullptr), Origin(nullptr), IRB(IRB), MSV(MSV) {} /// \brief Add a pair of shadow and origin values to the mix. Combiner &Add(Value *OpShadow, Value *OpOrigin) { @@ -1254,10 +1302,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (!Origin) { Origin = OpOrigin; } else { - Value *FlatShadow = MSV->convertToShadowTyNoVec(OpShadow, IRB); - Value *Cond = IRB.CreateICmpNE(FlatShadow, - MSV->getCleanShadow(FlatShadow)); - Origin = IRB.CreateSelect(Cond, OpOrigin, Origin); + Constant *ConstOrigin = dyn_cast<Constant>(OpOrigin); + // No point in adding something that might result in 0 origin value. 
+ if (!ConstOrigin || !ConstOrigin->isNullValue()) { + Value *FlatShadow = MSV->convertToShadowTyNoVec(OpShadow, IRB); + Value *Cond = + IRB.CreateICmpNE(FlatShadow, MSV->getCleanShadow(FlatShadow)); + Origin = IRB.CreateSelect(Cond, OpOrigin, Origin); + } } } return *this; @@ -1266,7 +1318,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// \brief Add an application value to the mix. Combiner &Add(Value *V) { Value *OpShadow = MSV->getShadow(V); - Value *OpOrigin = MSV->MS.TrackOrigins ? MSV->getOrigin(V) : 0; + Value *OpOrigin = MSV->MS.TrackOrigins ? MSV->getOrigin(V) : nullptr; return Add(OpShadow, OpOrigin); } @@ -1325,6 +1377,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // TODO: handle struct types. } + /// \brief Cast an application value to the type of its own shadow. + Value *CreateAppToShadowCast(IRBuilder<> &IRB, Value *V) { + Type *ShadowTy = getShadowTy(V); + if (V->getType() == ShadowTy) + return V; + if (V->getType()->isPtrOrPtrVectorTy()) + return IRB.CreatePtrToInt(V, ShadowTy); + else + return IRB.CreateBitCast(V, ShadowTy); + } + /// \brief Propagate shadow for arbitrary operation. void handleShadowOr(Instruction &I) { IRBuilder<> IRB(&I); @@ -1334,13 +1397,61 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { SC.Done(&I); } + // \brief Handle multiplication by constant. + // + // Handle a special case of multiplication by constant that may have one or + // more zeros in the lower bits. This makes the corresponding number of lower bits + // of the result zero as well. We model it by shifting the other operand + // shadow left by the required number of bits. Effectively, we transform + // (X * (A * 2**B)) to ((X << B) * A) and instrument (X << B) as (Sx << B). + // We use multiplication by 2**N instead of shift to cover the case of + // multiplication by 0, which may occur in some elements of a vector operand.
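+ // A concrete illustration: for X * 20, where 20 = 5 * 2**2, the two low + // bits of the product are zero for any X, so ShadowMul is 1 << 2 and the + // result shadow becomes Sx * 4, leaving the two low shadow bits clean.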
+ void handleMulByConstant(BinaryOperator &I, Constant *ConstArg, + Value *OtherArg) { + Constant *ShadowMul; + Type *Ty = ConstArg->getType(); + if (Ty->isVectorTy()) { + unsigned NumElements = Ty->getVectorNumElements(); + Type *EltTy = Ty->getSequentialElementType(); + SmallVector<Constant *, 16> Elements; + for (unsigned Idx = 0; Idx < NumElements; ++Idx) { + ConstantInt *Elt = + dyn_cast<ConstantInt>(ConstArg->getAggregateElement(Idx)); + APInt V = Elt->getValue(); + APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros(); + Elements.push_back(ConstantInt::get(EltTy, V2)); + } + ShadowMul = ConstantVector::get(Elements); + } else { + ConstantInt *Elt = dyn_cast<ConstantInt>(ConstArg); + APInt V = Elt->getValue(); + APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros(); + ShadowMul = ConstantInt::get(Elt->getType(), V2); + } + + IRBuilder<> IRB(&I); + setShadow(&I, + IRB.CreateMul(getShadow(OtherArg), ShadowMul, "msprop_mul_cst")); + setOrigin(&I, getOrigin(OtherArg)); + } + + void visitMul(BinaryOperator &I) { + Constant *constOp0 = dyn_cast<Constant>(I.getOperand(0)); + Constant *constOp1 = dyn_cast<Constant>(I.getOperand(1)); + if (constOp0 && !constOp1) + handleMulByConstant(I, constOp0, I.getOperand(1)); + else if (constOp1 && !constOp0) + handleMulByConstant(I, constOp1, I.getOperand(0)); + else + handleShadowOr(I); + } + void visitFAdd(BinaryOperator &I) { handleShadowOr(I); } void visitFSub(BinaryOperator &I) { handleShadowOr(I); } void visitFMul(BinaryOperator &I) { handleShadowOr(I); } void visitAdd(BinaryOperator &I) { handleShadowOr(I); } void visitSub(BinaryOperator &I) { handleShadowOr(I); } void visitXor(BinaryOperator &I) { handleShadowOr(I); } - void visitMul(BinaryOperator &I) { handleShadowOr(I); } void handleDiv(Instruction &I) { IRBuilder<> IRB(&I); @@ -1470,7 +1581,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { void handleSignedRelationalComparison(ICmpInst &I) { Constant *constOp0 = dyn_cast<Constant>(I.getOperand(0)); Constant *constOp1 = dyn_cast<Constant>(I.getOperand(1)); - Value* op = NULL; + Value* op = nullptr; CmpInst::Predicate pre = I.getPredicate(); if (constOp0 && constOp0->isNullValue() && (pre == CmpInst::ICMP_SGT || pre == CmpInst::ICMP_SLE)) { @@ -1646,7 +1757,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *Addr = I.getArgOperand(0); Type *ShadowTy = getShadowTy(&I); - if (LoadShadow) { + if (PropagateShadow) { Value *ShadowPtr = getShadowPtr(Addr, ShadowTy, IRB); // We don't know the pointer alignment (could be unaligned SSE load!). // Have to assume to worst case. @@ -1659,7 +1770,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { insertShadowCheck(Addr, &I); if (MS.TrackOrigins) { - if (LoadShadow) + if (PropagateShadow) setOrigin(&I, IRB.CreateLoad(getOriginPtr(Addr, IRB))); else setOrigin(&I, getCleanOrigin()); @@ -1779,7 +1890,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { break; case 1: ConvertOp = I.getArgOperand(0); - CopyOp = NULL; + CopyOp = nullptr; break; default: llvm_unreachable("Cvt intrinsic with unsupported number of arguments."); @@ -1793,7 +1904,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // FIXME: consider propagating shadow of ConvertOp, at least in the case of // int->any conversion. 
Value *ConvertShadow = getShadow(ConvertOp); - Value *AggShadow = 0; + Value *AggShadow = nullptr; if (ConvertOp->getType()->isVectorTy()) { AggShadow = IRB.CreateExtractElement( ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), 0)); @@ -1827,6 +1938,162 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { } } + // Given a scalar or vector, extract lower 64 bits (or less), and return all + // zeroes if it is zero, and all ones otherwise. + Value *Lower64ShadowExtend(IRBuilder<> &IRB, Value *S, Type *T) { + if (S->getType()->isVectorTy()) + S = CreateShadowCast(IRB, S, IRB.getInt64Ty(), /* Signed */ true); + assert(S->getType()->getPrimitiveSizeInBits() <= 64); + Value *S2 = IRB.CreateICmpNE(S, getCleanShadow(S)); + return CreateShadowCast(IRB, S2, T, /* Signed */ true); + } + + Value *VariableShadowExtend(IRBuilder<> &IRB, Value *S) { + Type *T = S->getType(); + assert(T->isVectorTy()); + Value *S2 = IRB.CreateICmpNE(S, getCleanShadow(S)); + return IRB.CreateSExt(S2, T); + } + + // \brief Instrument vector shift intrinsic. + // + // This function instruments intrinsics like int_x86_avx2_psll_w. + // Intrinsic shifts %In by %ShiftSize bits. + // %ShiftSize may be a vector. In that case the lower 64 bits determine shift + // size, and the rest is ignored. Behavior is defined even if shift size is + // greater than register (or field) width. + void handleVectorShiftIntrinsic(IntrinsicInst &I, bool Variable) { + assert(I.getNumArgOperands() == 2); + IRBuilder<> IRB(&I); + // If any of the S2 bits are poisoned, the whole thing is poisoned. + // Otherwise perform the same shift on S1. + Value *S1 = getShadow(&I, 0); + Value *S2 = getShadow(&I, 1); + Value *S2Conv = Variable ? VariableShadowExtend(IRB, S2) + : Lower64ShadowExtend(IRB, S2, getShadowTy(&I)); + Value *V1 = I.getOperand(0); + Value *V2 = I.getOperand(1); + Value *Shift = IRB.CreateCall2(I.getCalledValue(), + IRB.CreateBitCast(S1, V1->getType()), V2); + Shift = IRB.CreateBitCast(Shift, getShadowTy(&I)); + setShadow(&I, IRB.CreateOr(Shift, S2Conv)); + setOriginForNaryOp(I); + } + + // \brief Get an X86_MMX-sized vector type. + Type *getMMXVectorTy(unsigned EltSizeInBits) { + const unsigned X86_MMXSizeInBits = 64; + return VectorType::get(IntegerType::get(*MS.C, EltSizeInBits), + X86_MMXSizeInBits / EltSizeInBits); + } + + // \brief Returns a signed counterpart for an (un)signed-saturate-and-pack + // intrinsic. + Intrinsic::ID getSignedPackIntrinsic(Intrinsic::ID id) { + switch (id) { + case llvm::Intrinsic::x86_sse2_packsswb_128: + case llvm::Intrinsic::x86_sse2_packuswb_128: + return llvm::Intrinsic::x86_sse2_packsswb_128; + + case llvm::Intrinsic::x86_sse2_packssdw_128: + case llvm::Intrinsic::x86_sse41_packusdw: + return llvm::Intrinsic::x86_sse2_packssdw_128; + + case llvm::Intrinsic::x86_avx2_packsswb: + case llvm::Intrinsic::x86_avx2_packuswb: + return llvm::Intrinsic::x86_avx2_packsswb; + + case llvm::Intrinsic::x86_avx2_packssdw: + case llvm::Intrinsic::x86_avx2_packusdw: + return llvm::Intrinsic::x86_avx2_packssdw; + + case llvm::Intrinsic::x86_mmx_packsswb: + case llvm::Intrinsic::x86_mmx_packuswb: + return llvm::Intrinsic::x86_mmx_packsswb; + + case llvm::Intrinsic::x86_mmx_packssdw: + return llvm::Intrinsic::x86_mmx_packssdw; + default: + llvm_unreachable("unexpected intrinsic id"); + } + } + + // \brief Instrument vector pack intrinsic. + // + // This function instruments intrinsics like x86_mmx_packsswb, which + // pack elements of 2 input vectors into half as many bits with saturation.
+ // Shadow is propagated with the signed variant of the same intrinsic applied + // to sext(Sa != zeroinitializer), sext(Sb != zeroinitializer). + // EltSizeInBits is used only for x86mmx arguments. + void handleVectorPackIntrinsic(IntrinsicInst &I, unsigned EltSizeInBits = 0) { + assert(I.getNumArgOperands() == 2); + bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy(); + IRBuilder<> IRB(&I); + Value *S1 = getShadow(&I, 0); + Value *S2 = getShadow(&I, 1); + assert(isX86_MMX || S1->getType()->isVectorTy()); + + // SExt and ICmpNE below must apply to individual elements of input vectors. + // In case of x86mmx arguments, cast them to appropriate vector types and + // back. + Type *T = isX86_MMX ? getMMXVectorTy(EltSizeInBits) : S1->getType(); + if (isX86_MMX) { + S1 = IRB.CreateBitCast(S1, T); + S2 = IRB.CreateBitCast(S2, T); + } + Value *S1_ext = IRB.CreateSExt( + IRB.CreateICmpNE(S1, llvm::Constant::getNullValue(T)), T); + Value *S2_ext = IRB.CreateSExt( + IRB.CreateICmpNE(S2, llvm::Constant::getNullValue(T)), T); + if (isX86_MMX) { + Type *X86_MMXTy = Type::getX86_MMXTy(*MS.C); + S1_ext = IRB.CreateBitCast(S1_ext, X86_MMXTy); + S2_ext = IRB.CreateBitCast(S2_ext, X86_MMXTy); + } + + Function *ShadowFn = Intrinsic::getDeclaration( + F.getParent(), getSignedPackIntrinsic(I.getIntrinsicID())); + + Value *S = IRB.CreateCall2(ShadowFn, S1_ext, S2_ext, "_msprop_vector_pack"); + if (isX86_MMX) S = IRB.CreateBitCast(S, getShadowTy(&I)); + setShadow(&I, S); + setOriginForNaryOp(I); + } + + // \brief Instrument sum-of-absolute-differences intrinsic. + void handleVectorSadIntrinsic(IntrinsicInst &I) { + const unsigned SignificantBitsPerResultElement = 16; + bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy(); + Type *ResTy = isX86_MMX ? IntegerType::get(*MS.C, 64) : I.getType(); + unsigned ZeroBitsPerResultElement = + ResTy->getScalarSizeInBits() - SignificantBitsPerResultElement; + + IRBuilder<> IRB(&I); + Value *S = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1)); + S = IRB.CreateBitCast(S, ResTy); + S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)), + ResTy); + S = IRB.CreateLShr(S, ZeroBitsPerResultElement); + S = IRB.CreateBitCast(S, getShadowTy(&I)); + setShadow(&I, S); + setOriginForNaryOp(I); + } + + // \brief Instrument multiply-add intrinsic. + void handleVectorPmaddIntrinsic(IntrinsicInst &I, + unsigned EltSizeInBits = 0) { + bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy(); + Type *ResTy = isX86_MMX ?
getMMXVectorTy(EltSizeInBits * 2) : I.getType(); + IRBuilder<> IRB(&I); + Value *S = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1)); + S = IRB.CreateBitCast(S, ResTy); + S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)), + ResTy); + S = IRB.CreateBitCast(S, getShadowTy(&I)); + setShadow(&I, S); + setOriginForNaryOp(I); + } + void visitIntrinsicInst(IntrinsicInst &I) { switch (I.getIntrinsicID()) { case llvm::Intrinsic::bswap: @@ -1866,6 +2133,124 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case llvm::Intrinsic::x86_sse_cvttps2pi: handleVectorConvertIntrinsic(I, 2); break; + case llvm::Intrinsic::x86_avx512_psll_dq: + case llvm::Intrinsic::x86_avx512_psrl_dq: + case llvm::Intrinsic::x86_avx2_psll_w: + case llvm::Intrinsic::x86_avx2_psll_d: + case llvm::Intrinsic::x86_avx2_psll_q: + case llvm::Intrinsic::x86_avx2_pslli_w: + case llvm::Intrinsic::x86_avx2_pslli_d: + case llvm::Intrinsic::x86_avx2_pslli_q: + case llvm::Intrinsic::x86_avx2_psll_dq: + case llvm::Intrinsic::x86_avx2_psrl_w: + case llvm::Intrinsic::x86_avx2_psrl_d: + case llvm::Intrinsic::x86_avx2_psrl_q: + case llvm::Intrinsic::x86_avx2_psra_w: + case llvm::Intrinsic::x86_avx2_psra_d: + case llvm::Intrinsic::x86_avx2_psrli_w: + case llvm::Intrinsic::x86_avx2_psrli_d: + case llvm::Intrinsic::x86_avx2_psrli_q: + case llvm::Intrinsic::x86_avx2_psrai_w: + case llvm::Intrinsic::x86_avx2_psrai_d: + case llvm::Intrinsic::x86_avx2_psrl_dq: + case llvm::Intrinsic::x86_sse2_psll_w: + case llvm::Intrinsic::x86_sse2_psll_d: + case llvm::Intrinsic::x86_sse2_psll_q: + case llvm::Intrinsic::x86_sse2_pslli_w: + case llvm::Intrinsic::x86_sse2_pslli_d: + case llvm::Intrinsic::x86_sse2_pslli_q: + case llvm::Intrinsic::x86_sse2_psll_dq: + case llvm::Intrinsic::x86_sse2_psrl_w: + case llvm::Intrinsic::x86_sse2_psrl_d: + case llvm::Intrinsic::x86_sse2_psrl_q: + case llvm::Intrinsic::x86_sse2_psra_w: + case llvm::Intrinsic::x86_sse2_psra_d: + case llvm::Intrinsic::x86_sse2_psrli_w: + case llvm::Intrinsic::x86_sse2_psrli_d: + case llvm::Intrinsic::x86_sse2_psrli_q: + case llvm::Intrinsic::x86_sse2_psrai_w: + case llvm::Intrinsic::x86_sse2_psrai_d: + case llvm::Intrinsic::x86_sse2_psrl_dq: + case llvm::Intrinsic::x86_mmx_psll_w: + case llvm::Intrinsic::x86_mmx_psll_d: + case llvm::Intrinsic::x86_mmx_psll_q: + case llvm::Intrinsic::x86_mmx_pslli_w: + case llvm::Intrinsic::x86_mmx_pslli_d: + case llvm::Intrinsic::x86_mmx_pslli_q: + case llvm::Intrinsic::x86_mmx_psrl_w: + case llvm::Intrinsic::x86_mmx_psrl_d: + case llvm::Intrinsic::x86_mmx_psrl_q: + case llvm::Intrinsic::x86_mmx_psra_w: + case llvm::Intrinsic::x86_mmx_psra_d: + case llvm::Intrinsic::x86_mmx_psrli_w: + case llvm::Intrinsic::x86_mmx_psrli_d: + case llvm::Intrinsic::x86_mmx_psrli_q: + case llvm::Intrinsic::x86_mmx_psrai_w: + case llvm::Intrinsic::x86_mmx_psrai_d: + handleVectorShiftIntrinsic(I, /* Variable */ false); + break; + case llvm::Intrinsic::x86_avx2_psllv_d: + case llvm::Intrinsic::x86_avx2_psllv_d_256: + case llvm::Intrinsic::x86_avx2_psllv_q: + case llvm::Intrinsic::x86_avx2_psllv_q_256: + case llvm::Intrinsic::x86_avx2_psrlv_d: + case llvm::Intrinsic::x86_avx2_psrlv_d_256: + case llvm::Intrinsic::x86_avx2_psrlv_q: + case llvm::Intrinsic::x86_avx2_psrlv_q_256: + case llvm::Intrinsic::x86_avx2_psrav_d: + case llvm::Intrinsic::x86_avx2_psrav_d_256: + handleVectorShiftIntrinsic(I, /* Variable */ true); + break; + + // Byte shifts are not implemented. 
+ // case llvm::Intrinsic::x86_avx512_psll_dq_bs: + // case llvm::Intrinsic::x86_avx512_psrl_dq_bs: + // case llvm::Intrinsic::x86_avx2_psll_dq_bs: + // case llvm::Intrinsic::x86_avx2_psrl_dq_bs: + // case llvm::Intrinsic::x86_sse2_psll_dq_bs: + // case llvm::Intrinsic::x86_sse2_psrl_dq_bs: + + case llvm::Intrinsic::x86_sse2_packsswb_128: + case llvm::Intrinsic::x86_sse2_packssdw_128: + case llvm::Intrinsic::x86_sse2_packuswb_128: + case llvm::Intrinsic::x86_sse41_packusdw: + case llvm::Intrinsic::x86_avx2_packsswb: + case llvm::Intrinsic::x86_avx2_packssdw: + case llvm::Intrinsic::x86_avx2_packuswb: + case llvm::Intrinsic::x86_avx2_packusdw: + handleVectorPackIntrinsic(I); + break; + + case llvm::Intrinsic::x86_mmx_packsswb: + case llvm::Intrinsic::x86_mmx_packuswb: + handleVectorPackIntrinsic(I, 16); + break; + + case llvm::Intrinsic::x86_mmx_packssdw: + handleVectorPackIntrinsic(I, 32); + break; + + case llvm::Intrinsic::x86_mmx_psad_bw: + case llvm::Intrinsic::x86_sse2_psad_bw: + case llvm::Intrinsic::x86_avx2_psad_bw: + handleVectorSadIntrinsic(I); + break; + + case llvm::Intrinsic::x86_sse2_pmadd_wd: + case llvm::Intrinsic::x86_avx2_pmadd_wd: + case llvm::Intrinsic::x86_ssse3_pmadd_ub_sw_128: + case llvm::Intrinsic::x86_avx2_pmadd_ub_sw: + handleVectorPmaddIntrinsic(I); + break; + + case llvm::Intrinsic::x86_ssse3_pmadd_ub_sw: + handleVectorPmaddIntrinsic(I, 8); + break; + + case llvm::Intrinsic::x86_mmx_pmadd_wd: + handleVectorPmaddIntrinsic(I, 16); + break; + default: if (!handleUnknownIntrinsic(I)) visitInstruction(I); @@ -1887,12 +2272,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { return; } - // Allow only tail calls with the same types, otherwise - // we may have a false positive: shadow for a non-void RetVal - // will get propagated to a void RetVal. - if (Call->isTailCall() && Call->getType() != Call->getParent()->getType()) - Call->setTailCall(false); - assert(!isa<IntrinsicInst>(&I) && "intrinsics are handled elsewhere"); // We are going to insert code that relies on the fact that the callee @@ -1926,7 +2305,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { continue; } unsigned Size = 0; - Value *Store = 0; + Value *Store = nullptr; // Compute the Shadow for arg even if it is ByVal, because // in that case getShadow() will copy the actual arg shadow to // __msan_param_tls. 
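[Editor's note] The pack, psad and pmadd handlers added above all reduce to one shadow rule: OR the two argument shadows, turn any nonzero element into an all-ones element via sext(icmp ne ..., 0), then keep only the bits the instruction can actually produce. A minimal scalar sketch of the psadbw case, standalone C++ rather than MSan code, with illustrative names (sadShadow is not part of the pass):

#include <cassert>
#include <cstdint>

// One 64-bit psadbw result element carries at most 16 significant bits, so a
// poisoned bit anywhere in either input poisons exactly those 16 bits.
uint64_t sadShadow(uint64_t S1, uint64_t S2) {
  const unsigned SignificantBits = 16;
  uint64_t S = S1 | S2;                  // IRB.CreateOr(shadow0, shadow1)
  uint64_t Ext = (S != 0) ? ~0ULL : 0;   // sext(icmp ne S, 0)
  return Ext >> (64 - SignificantBits);  // lshr by ZeroBitsPerResultElement
}

int main() {
  assert(sadShadow(0, 0) == 0);          // clean inputs give a clean result
  assert(sadShadow(1, 0) == 0xFFFF);     // any poison taints the low 16 bits
}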
@@ -1934,24 +2313,27 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *ArgShadowBase = getShadowPtrForArgument(A, IRB, ArgOffset); DEBUG(dbgs() << " Arg#" << i << ": " << *A << " Shadow: " << *ArgShadow << "\n"); + bool ArgIsInitialized = false; if (CS.paramHasAttr(i + 1, Attribute::ByVal)) { assert(A->getType()->isPointerTy() && "ByVal argument is not a pointer!"); - Size = MS.TD->getTypeAllocSize(A->getType()->getPointerElementType()); + Size = MS.DL->getTypeAllocSize(A->getType()->getPointerElementType()); unsigned Alignment = CS.getParamAlignment(i + 1); Store = IRB.CreateMemCpy(ArgShadowBase, getShadowPtr(A, Type::getInt8Ty(*MS.C), IRB), Size, Alignment); } else { - Size = MS.TD->getTypeAllocSize(A->getType()); + Size = MS.DL->getTypeAllocSize(A->getType()); Store = IRB.CreateAlignedStore(ArgShadow, ArgShadowBase, kShadowTLSAlignment); + Constant *Cst = dyn_cast<Constant>(ArgShadow); + if (Cst && Cst->isNullValue()) ArgIsInitialized = true; } - if (MS.TrackOrigins) + if (MS.TrackOrigins && !ArgIsInitialized) IRB.CreateStore(getOrigin(A), getOriginPtrForArgument(A, IRB, ArgOffset)); (void)Store; - assert(Size != 0 && Store != 0); + assert(Size != 0 && Store != nullptr); DEBUG(dbgs() << " Param:" << *Store << "\n"); ArgOffset += DataLayout::RoundUpAlignment(Size, 8); } @@ -1966,10 +2348,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // Now, get the shadow for the RetVal. if (!I.getType()->isSized()) return; IRBuilder<> IRBBefore(&I); - // Untill we have full dynamic coverage, make sure the retval shadow is 0. + // Until we have full dynamic coverage, make sure the retval shadow is 0. Value *Base = getShadowPtrForRetval(&I, IRBBefore); IRBBefore.CreateAlignedStore(getCleanShadow(&I), Base, kShadowTLSAlignment); - Instruction *NextInsn = 0; + Instruction *NextInsn = nullptr; if (CS.isCall()) { NextInsn = I.getNextNode(); } else { @@ -2015,6 +2397,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { void visitPHINode(PHINode &I) { IRBuilder<> IRB(&I); + if (!PropagateShadow) { + setShadow(&I, getCleanShadow(&I)); + return; + } + ShadowPHINodes.push_back(&I); setShadow(&I, IRB.CreatePHI(getShadowTy(&I), I.getNumIncomingValues(), "_msphi_s")); @@ -2026,7 +2413,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { void visitAllocaInst(AllocaInst &I) { setShadow(&I, getCleanShadow(&I)); IRBuilder<> IRB(I.getNextNode()); - uint64_t Size = MS.TD->getTypeAllocSize(I.getAllocatedType()); + uint64_t Size = MS.DL->getTypeAllocSize(I.getAllocatedType()); if (PoisonStack && ClPoisonStackWithCall) { IRB.CreateCall2(MS.MsanPoisonStackFn, IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), @@ -2062,33 +2449,51 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { void visitSelectInst(SelectInst& I) { IRBuilder<> IRB(&I); // a = select b, c, d - Value *S = IRB.CreateSelect(I.getCondition(), getShadow(I.getTrueValue()), - getShadow(I.getFalseValue())); + Value *B = I.getCondition(); + Value *C = I.getTrueValue(); + Value *D = I.getFalseValue(); + Value *Sb = getShadow(B); + Value *Sc = getShadow(C); + Value *Sd = getShadow(D); + + // Result shadow if condition shadow is 0. + Value *Sa0 = IRB.CreateSelect(B, Sc, Sd); + Value *Sa1; if (I.getType()->isAggregateType()) { // To avoid "sign extending" i1 to an arbitrary aggregate type, we just do // an extra "select". This results in much more compact IR. 
// Sa = select Sb, poisoned, (select b, Sc, Sd) - S = IRB.CreateSelect(getShadow(I.getCondition()), - getPoisonedShadow(getShadowTy(I.getType())), S, - "_msprop_select_agg"); + Sa1 = getPoisonedShadow(getShadowTy(I.getType())); } else { - // Sa = (sext Sb) | (select b, Sc, Sd) - S = IRB.CreateOr(S, CreateShadowCast(IRB, getShadow(I.getCondition()), - S->getType(), true), - "_msprop_select"); + // Sa = select Sb, [ (c^d) | Sc | Sd ], [ b ? Sc : Sd ] + // If Sb (condition is poisoned), look for bits in c and d that are equal + // and both unpoisoned. + // If !Sb (condition is unpoisoned), simply pick one of Sc and Sd. + + // Cast arguments to shadow-compatible type. + C = CreateAppToShadowCast(IRB, C); + D = CreateAppToShadowCast(IRB, D); + + // Result shadow if condition shadow is 1. + Sa1 = IRB.CreateOr(IRB.CreateXor(C, D), IRB.CreateOr(Sc, Sd)); } - setShadow(&I, S); + Value *Sa = IRB.CreateSelect(Sb, Sa1, Sa0, "_msprop_select"); + setShadow(&I, Sa); if (MS.TrackOrigins) { // Origins are always i32, so any vector conditions must be flattened. // FIXME: consider tracking vector origins for app vectors? - Value *Cond = I.getCondition(); - if (Cond->getType()->isVectorTy()) { - Value *ConvertedShadow = convertToShadowTyNoVec(Cond, IRB); - Cond = IRB.CreateICmpNE(ConvertedShadow, - getCleanShadow(ConvertedShadow), "_mso_select"); + if (B->getType()->isVectorTy()) { + Type *FlatTy = getShadowTyNoVec(B->getType()); + B = IRB.CreateICmpNE(IRB.CreateBitCast(B, FlatTy), + ConstantInt::getNullValue(FlatTy)); + Sb = IRB.CreateICmpNE(IRB.CreateBitCast(Sb, FlatTy), + ConstantInt::getNullValue(FlatTy)); } - setOrigin(&I, IRB.CreateSelect(Cond, - getOrigin(I.getTrueValue()), getOrigin(I.getFalseValue()))); + // a = select b, c, d + // Oa = Sb ? Ob : (b ? Oc : Od) + setOrigin(&I, IRB.CreateSelect( + Sb, getOrigin(I.getCondition()), + IRB.CreateSelect(B, getOrigin(C), getOrigin(D)))); } } @@ -2171,7 +2576,8 @@ struct VarArgAMD64Helper : public VarArgHelper { VarArgAMD64Helper(Function &F, MemorySanitizer &MS, MemorySanitizerVisitor &MSV) - : F(F), MS(MS), MSV(MSV), VAArgTLSCopy(0), VAArgOverflowSize(0) { } + : F(F), MS(MS), MSV(MSV), VAArgTLSCopy(nullptr), + VAArgOverflowSize(nullptr) {} enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory }; @@ -2195,34 +2601,47 @@ struct VarArgAMD64Helper : public VarArgHelper { // would have been to associate each live instance of va_list with a copy of // MSanParamTLS, and extract shadow on va_arg() call in the argument list // order. 
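[Editor's note] visitCallSite below walks the argument list while tracking three cursors into the va_arg shadow area: general-purpose register slots (8 bytes each, up to offset 48), FP/SSE slots (16 bytes each, up to offset 176), and the memory overflow area after that. A standalone sketch of just that bookkeeping, assuming the same AMD64 ABI constants; VAArgLayout and place are illustrative names, not part of the pass:

#include <cassert>

// AMD64 ABI register save area: 6 GP registers * 8 bytes, then 8 vector
// registers * 16 bytes, then the overflow (stack) area.
const unsigned AMD64GpEndOffset = 48;
const unsigned AMD64FpEndOffset = 176;

enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory };

struct VAArgLayout {
  unsigned Gp = 0;
  unsigned Fp = AMD64GpEndOffset;
  unsigned Overflow = AMD64FpEndOffset;

  // Returns the shadow offset for the next argument; once a register class is
  // exhausted the argument is demoted to the overflow area, as in the hunk.
  unsigned place(ArgKind AK, unsigned Size) {
    if (AK == AK_GeneralPurpose && Gp >= AMD64GpEndOffset) AK = AK_Memory;
    if (AK == AK_FloatingPoint && Fp >= AMD64FpEndOffset) AK = AK_Memory;
    switch (AK) {
    case AK_GeneralPurpose: { unsigned O = Gp; Gp += 8; return O; }
    case AK_FloatingPoint:  { unsigned O = Fp; Fp += 16; return O; }
    case AK_Memory: {
      unsigned O = Overflow;
      Overflow += (Size + 7) & ~7u;      // RoundUpAlignment(Size, 8)
      return O;
    }
    }
    return 0;                            // not reached
  }
};

int main() {
  VAArgLayout L;
  assert(L.place(AK_GeneralPurpose, 8) == 0);   // first GP register slot
  assert(L.place(AK_FloatingPoint, 16) == 48);  // first FP register slot
  assert(L.place(AK_Memory, 24) == 176);        // overflow area start
}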
- void visitCallSite(CallSite &CS, IRBuilder<> &IRB) { + void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override { unsigned GpOffset = 0; unsigned FpOffset = AMD64GpEndOffset; unsigned OverflowOffset = AMD64FpEndOffset; for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end(); ArgIt != End; ++ArgIt) { Value *A = *ArgIt; - ArgKind AK = classifyArgument(A); - if (AK == AK_GeneralPurpose && GpOffset >= AMD64GpEndOffset) - AK = AK_Memory; - if (AK == AK_FloatingPoint && FpOffset >= AMD64FpEndOffset) - AK = AK_Memory; - Value *Base; - switch (AK) { - case AK_GeneralPurpose: - Base = getShadowPtrForVAArgument(A, IRB, GpOffset); - GpOffset += 8; - break; - case AK_FloatingPoint: - Base = getShadowPtrForVAArgument(A, IRB, FpOffset); - FpOffset += 16; - break; - case AK_Memory: - uint64_t ArgSize = MS.TD->getTypeAllocSize(A->getType()); - Base = getShadowPtrForVAArgument(A, IRB, OverflowOffset); + unsigned ArgNo = CS.getArgumentNo(ArgIt); + bool IsByVal = CS.paramHasAttr(ArgNo + 1, Attribute::ByVal); + if (IsByVal) { + // ByVal arguments always go to the overflow area. + assert(A->getType()->isPointerTy()); + Type *RealTy = A->getType()->getPointerElementType(); + uint64_t ArgSize = MS.DL->getTypeAllocSize(RealTy); + Value *Base = getShadowPtrForVAArgument(RealTy, IRB, OverflowOffset); OverflowOffset += DataLayout::RoundUpAlignment(ArgSize, 8); + IRB.CreateMemCpy(Base, MSV.getShadowPtr(A, IRB.getInt8Ty(), IRB), + ArgSize, kShadowTLSAlignment); + } else { + ArgKind AK = classifyArgument(A); + if (AK == AK_GeneralPurpose && GpOffset >= AMD64GpEndOffset) + AK = AK_Memory; + if (AK == AK_FloatingPoint && FpOffset >= AMD64FpEndOffset) + AK = AK_Memory; + Value *Base; + switch (AK) { + case AK_GeneralPurpose: + Base = getShadowPtrForVAArgument(A->getType(), IRB, GpOffset); + GpOffset += 8; + break; + case AK_FloatingPoint: + Base = getShadowPtrForVAArgument(A->getType(), IRB, FpOffset); + FpOffset += 16; + break; + case AK_Memory: + uint64_t ArgSize = MS.DL->getTypeAllocSize(A->getType()); + Base = getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset); + OverflowOffset += DataLayout::RoundUpAlignment(ArgSize, 8); + } + IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment); } - IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment); } Constant *OverflowSize = ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AMD64FpEndOffset); @@ -2230,15 +2649,15 @@ struct VarArgAMD64Helper : public VarArgHelper { } /// \brief Compute the shadow address for a given va_arg. 
- Value *getShadowPtrForVAArgument(Value *A, IRBuilder<> &IRB, + Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB, int ArgOffset) { Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy); Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); - return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(A), 0), + return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0), "_msarg"); } - void visitVAStartInst(VAStartInst &I) { + void visitVAStartInst(VAStartInst &I) override { IRBuilder<> IRB(&I); VAStartInstrumentationList.push_back(&I); Value *VAListTag = I.getArgOperand(0); @@ -2250,7 +2669,7 @@ struct VarArgAMD64Helper : public VarArgHelper { /* size */24, /* alignment */8, false); } - void visitVACopyInst(VACopyInst &I) { + void visitVACopyInst(VACopyInst &I) override { IRBuilder<> IRB(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB); @@ -2261,7 +2680,7 @@ struct VarArgAMD64Helper : public VarArgHelper { /* size */24, /* alignment */8, false); } - void finalizeInstrumentation() { + void finalizeInstrumentation() override { assert(!VAArgOverflowSize && !VAArgTLSCopy && "finalizeInstrumentation called twice"); if (!VAStartInstrumentationList.empty()) { @@ -2313,13 +2732,13 @@ struct VarArgNoOpHelper : public VarArgHelper { VarArgNoOpHelper(Function &F, MemorySanitizer &MS, MemorySanitizerVisitor &MSV) {} - void visitCallSite(CallSite &CS, IRBuilder<> &IRB) {} + void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override {} - void visitVAStartInst(VAStartInst &I) {} + void visitVAStartInst(VAStartInst &I) override {} - void visitVACopyInst(VACopyInst &I) {} + void visitVACopyInst(VACopyInst &I) override {} - void finalizeInstrumentation() {} + void finalizeInstrumentation() override {} }; VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan, diff --git a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 89fb746..89386a6 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -19,8 +19,6 @@ // The rest is handled by the run-time library. //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "tsan" - #include "llvm/Transforms/Instrumentation.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" @@ -42,12 +40,11 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ModuleUtils.h" -#include "llvm/Transforms/Utils/SpecialCaseList.h" using namespace llvm; -static cl::opt<std::string> ClBlacklistFile("tsan-blacklist", - cl::desc("Blacklist file"), cl::Hidden); +#define DEBUG_TYPE "tsan" + static cl::opt<bool> ClInstrumentMemoryAccesses( "tsan-instrument-memory-accesses", cl::init(true), cl::desc("Instrument memory accesses"), cl::Hidden); @@ -76,14 +73,10 @@ namespace { /// ThreadSanitizer: instrument the code in module to find races. struct ThreadSanitizer : public FunctionPass { - ThreadSanitizer(StringRef BlacklistFile = StringRef()) - : FunctionPass(ID), - TD(0), - BlacklistFile(BlacklistFile.empty() ? 
ClBlacklistFile - : BlacklistFile) { } - const char *getPassName() const; - bool runOnFunction(Function &F); - bool doInitialization(Module &M); + ThreadSanitizer() : FunctionPass(ID), DL(nullptr) {} + const char *getPassName() const override; + bool runOnFunction(Function &F) override; + bool doInitialization(Module &M) override; static char ID; // Pass identification, replacement for typeid. private: @@ -96,10 +89,8 @@ struct ThreadSanitizer : public FunctionPass { bool addrPointsToConstantData(Value *Addr); int getMemoryAccessFuncIndex(Value *Addr); - DataLayout *TD; + const DataLayout *DL; Type *IntptrTy; - SmallString<64> BlacklistFile; - OwningPtr<SpecialCaseList> BL; IntegerType *OrdTy; // Callbacks to run-time library are computed in doInitialization. Function *TsanFuncEntry; @@ -129,8 +120,8 @@ const char *ThreadSanitizer::getPassName() const { return "ThreadSanitizer"; } -FunctionPass *llvm::createThreadSanitizerPass(StringRef BlacklistFile) { - return new ThreadSanitizer(BlacklistFile); +FunctionPass *llvm::createThreadSanitizerPass() { + return new ThreadSanitizer(); } static Function *checkInterfaceFunction(Constant *FuncOrBitcast) { @@ -174,8 +165,8 @@ void ThreadSanitizer::initializeCallbacks(Module &M) { for (int op = AtomicRMWInst::FIRST_BINOP; op <= AtomicRMWInst::LAST_BINOP; ++op) { - TsanAtomicRMW[op][i] = NULL; - const char *NamePart = NULL; + TsanAtomicRMW[op][i] = nullptr; + const char *NamePart = nullptr; if (op == AtomicRMWInst::Xchg) NamePart = "_exchange"; else if (op == AtomicRMWInst::Add) @@ -224,14 +215,14 @@ void ThreadSanitizer::initializeCallbacks(Module &M) { } bool ThreadSanitizer::doInitialization(Module &M) { - TD = getAnalysisIfAvailable<DataLayout>(); - if (!TD) - return false; - BL.reset(SpecialCaseList::createOrDie(BlacklistFile)); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + if (!DLP) + report_fatal_error("data layout missing"); + DL = &DLP->getDataLayout(); // Always insert a call to __tsan_init into the module's CTORs. IRBuilder<> IRB(M.getContext()); - IntptrTy = IRB.getIntPtrTy(TD); + IntptrTy = IRB.getIntPtrTy(DL); Value *TsanInit = M.getOrInsertFunction("__tsan_init", IRB.getVoidTy(), NULL); appendToGlobalCtors(M, cast<Function>(TsanInit), 0); @@ -320,8 +311,7 @@ static bool isAtomic(Instruction *I) { } bool ThreadSanitizer::runOnFunction(Function &F) { - if (!TD) return false; - if (BL->isIn(F)) return false; + if (!DL) return false; initializeCallbacks(*F.getParent()); SmallVector<Instruction*, 8> RetVec; SmallVector<Instruction*, 8> AllLoadsAndStores; @@ -330,22 +320,20 @@ bool ThreadSanitizer::runOnFunction(Function &F) { SmallVector<Instruction*, 8> MemIntrinCalls; bool Res = false; bool HasCalls = false; + bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeThread); // Traverse all instructions, collect loads/stores/returns, check for calls. 
- for (Function::iterator FI = F.begin(), FE = F.end(); - FI != FE; ++FI) { - BasicBlock &BB = *FI; - for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); - BI != BE; ++BI) { - if (isAtomic(BI)) - AtomicAccesses.push_back(BI); - else if (isa<LoadInst>(BI) || isa<StoreInst>(BI)) - LocalLoadsAndStores.push_back(BI); - else if (isa<ReturnInst>(BI)) - RetVec.push_back(BI); - else if (isa<CallInst>(BI) || isa<InvokeInst>(BI)) { - if (isa<MemIntrinsic>(BI)) - MemIntrinCalls.push_back(BI); + for (auto &BB : F) { + for (auto &Inst : BB) { + if (isAtomic(&Inst)) + AtomicAccesses.push_back(&Inst); + else if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst)) + LocalLoadsAndStores.push_back(&Inst); + else if (isa<ReturnInst>(Inst)) + RetVec.push_back(&Inst); + else if (isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) { + if (isa<MemIntrinsic>(Inst)) + MemIntrinCalls.push_back(&Inst); HasCalls = true; chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores); } @@ -357,21 +345,22 @@ bool ThreadSanitizer::runOnFunction(Function &F) { // FIXME: many of these accesses do not need to be checked for races // (e.g. variables that do not escape, etc). - // Instrument memory accesses. - if (ClInstrumentMemoryAccesses && F.hasFnAttribute(Attribute::SanitizeThread)) - for (size_t i = 0, n = AllLoadsAndStores.size(); i < n; ++i) { - Res |= instrumentLoadOrStore(AllLoadsAndStores[i]); + // Instrument memory accesses only if we want to report bugs in the function. + if (ClInstrumentMemoryAccesses && SanitizeFunction) + for (auto Inst : AllLoadsAndStores) { + Res |= instrumentLoadOrStore(Inst); } - // Instrument atomic memory accesses. + // Instrument atomic memory accesses in any case (they can be used to + // implement synchronization). if (ClInstrumentAtomics) - for (size_t i = 0, n = AtomicAccesses.size(); i < n; ++i) { - Res |= instrumentAtomic(AtomicAccesses[i]); + for (auto Inst : AtomicAccesses) { + Res |= instrumentAtomic(Inst); } - if (ClInstrumentMemIntrinsics) - for (size_t i = 0, n = MemIntrinCalls.size(); i < n; ++i) { - Res |= instrumentMemIntrinsic(MemIntrinCalls[i]); + if (ClInstrumentMemIntrinsics && SanitizeFunction) + for (auto Inst : MemIntrinCalls) { + Res |= instrumentMemIntrinsic(Inst); } // Instrument function entry/exit points if there were instrumented accesses. @@ -381,8 +370,8 @@ bool ThreadSanitizer::runOnFunction(Function &F) { Intrinsic::getDeclaration(F.getParent(), Intrinsic::returnaddress), IRB.getInt32(0)); IRB.CreateCall(TsanFuncEntry, ReturnAddress); - for (size_t i = 0, n = RetVec.size(); i < n; ++i) { - IRBuilder<> IRBRet(RetVec[i]); + for (auto RetInst : RetVec) { + IRBuilder<> IRBRet(RetInst); IRBRet.CreateCall(TsanFuncExit); } Res = true; @@ -402,8 +391,13 @@ bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I) { if (IsWrite && isVtableAccess(I)) { DEBUG(dbgs() << " VPTR : " << *I << "\n"); Value *StoredValue = cast<StoreInst>(I)->getValueOperand(); - // StoredValue does not necessary have a pointer type. - if (isa<IntegerType>(StoredValue->getType())) + // StoredValue may be a vector type if we are storing several vptrs at once. + // In this case, just take the first element of the vector since this is + // enough to find vptr races. + if (isa<VectorType>(StoredValue->getType())) + StoredValue = IRB.CreateExtractElement( + StoredValue, ConstantInt::get(IRB.getInt32Ty(), 0)); + if (StoredValue->getType()->isIntegerTy()) StoredValue = IRB.CreateIntToPtr(StoredValue, IRB.getInt8PtrTy()); // Call TsanVptrUpdate. 
IRB.CreateCall2(TsanVptrUpdate, @@ -440,21 +434,6 @@ static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) { return IRB->getInt32(v); } -static ConstantInt *createFailOrdering(IRBuilder<> *IRB, AtomicOrdering ord) { - uint32_t v = 0; - switch (ord) { - case NotAtomic: assert(false); - case Unordered: // Fall-through. - case Monotonic: v = 0; break; - // case Consume: v = 1; break; // Not specified yet. - case Acquire: v = 2; break; - case Release: v = 0; break; - case AcquireRelease: v = 2; break; - case SequentiallyConsistent: v = 5; break; - } - return IRB->getInt32(v); -} - // If a memset intrinsic gets inlined by the code gen, we will miss races on it. // So, we either need to ensure the intrinsic is not inlined, or instrument it. // We do not instrument memset/memmove/memcpy intrinsics (too complicated), @@ -482,7 +461,7 @@ bool ThreadSanitizer::instrumentMemIntrinsic(Instruction *I) { } // Both llvm and ThreadSanitizer atomic operations are based on C++11/C1x -// standards. For background see C++11 standard. A slightly older, publically +// standards. For background see C++11 standard. A slightly older, publicly // available draft of the standard (not entirely up-to-date, but close enough // for casual browsing) is available here: // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2011/n3242.pdf @@ -527,7 +506,7 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I) { if (Idx < 0) return false; Function *F = TsanAtomicRMW[RMWI->getOperation()][Idx]; - if (F == NULL) + if (!F) return false; const size_t ByteSize = 1 << Idx; const size_t BitSize = ByteSize * 8; @@ -550,10 +529,16 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I) { Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), IRB.CreateIntCast(CASI->getCompareOperand(), Ty, false), IRB.CreateIntCast(CASI->getNewValOperand(), Ty, false), - createOrdering(&IRB, CASI->getOrdering()), - createFailOrdering(&IRB, CASI->getOrdering())}; - CallInst *C = CallInst::Create(TsanAtomicCAS[Idx], ArrayRef<Value*>(Args)); - ReplaceInstWithInst(I, C); + createOrdering(&IRB, CASI->getSuccessOrdering()), + createOrdering(&IRB, CASI->getFailureOrdering())}; + CallInst *C = IRB.CreateCall(TsanAtomicCAS[Idx], Args); + Value *Success = IRB.CreateICmpEQ(C, CASI->getCompareOperand()); + + Value *Res = IRB.CreateInsertValue(UndefValue::get(CASI->getType()), C, 0); + Res = IRB.CreateInsertValue(Res, Success, 1); + + I->replaceAllUsesWith(Res); + I->eraseFromParent(); } else if (FenceInst *FI = dyn_cast<FenceInst>(I)) { Value *Args[] = {createOrdering(&IRB, FI->getOrdering())}; Function *F = FI->getSynchScope() == SingleThread ? 
@@ -568,7 +553,7 @@ int ThreadSanitizer::getMemoryAccessFuncIndex(Value *Addr) { Type *OrigPtrTy = Addr->getType(); Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType(); assert(OrigTy->isSized()); - uint32_t TypeSize = TD->getTypeStoreSizeInBits(OrigTy); + uint32_t TypeSize = DL->getTypeStoreSizeInBits(OrigTy); if (TypeSize != 8 && TypeSize != 16 && TypeSize != 32 && TypeSize != 64 && TypeSize != 128) { NumAccessesWithBadSize++; diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h index 4eac39d..4098428 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h +++ b/contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h @@ -43,34 +43,34 @@ public: EPT_RetainAutoreleaseRV }; - ARCRuntimeEntryPoints() : TheModule(0), - AutoreleaseRV(0), - Release(0), - Retain(0), - RetainBlock(0), - Autorelease(0), - StoreStrong(0), - RetainRV(0), - RetainAutorelease(0), - RetainAutoreleaseRV(0) { } + ARCRuntimeEntryPoints() : TheModule(nullptr), + AutoreleaseRV(nullptr), + Release(nullptr), + Retain(nullptr), + RetainBlock(nullptr), + Autorelease(nullptr), + StoreStrong(nullptr), + RetainRV(nullptr), + RetainAutorelease(nullptr), + RetainAutoreleaseRV(nullptr) { } ~ARCRuntimeEntryPoints() { } void Initialize(Module *M) { TheModule = M; - AutoreleaseRV = 0; - Release = 0; - Retain = 0; - RetainBlock = 0; - Autorelease = 0; - StoreStrong = 0; - RetainRV = 0; - RetainAutorelease = 0; - RetainAutoreleaseRV = 0; + AutoreleaseRV = nullptr; + Release = nullptr; + Retain = nullptr; + RetainBlock = nullptr; + Autorelease = nullptr; + StoreStrong = nullptr; + RetainRV = nullptr; + RetainAutorelease = nullptr; + RetainAutoreleaseRV = nullptr; } Constant *get(const EntryPointType entry) { - assert(TheModule != 0 && "Not initialized."); + assert(TheModule != nullptr && "Not initialized."); switch (entry) { case EPT_AutoreleaseRV: diff --git a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp index 8f917ae..08c8842 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp @@ -20,15 +20,16 @@ /// //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "objc-arc-dependency" #include "ObjCARC.h" #include "DependencyAnalysis.h" #include "ProvenanceAnalysis.h" -#include "llvm/Support/CFG.h" +#include "llvm/IR/CFG.h" using namespace llvm; using namespace llvm::objcarc; +#define DEBUG_TYPE "objc-arc-dependency" + /// Test whether the given instruction can result in a reference count /// modification (positive or negative) for the pointer's object. bool @@ -223,7 +224,7 @@ llvm::objcarc::FindDependencies(DependenceKind Flavor, pred_iterator PI(LocalStartBB), PE(LocalStartBB, false); if (PI == PE) // If we've reached the function entry, produce a null dependence. - DependingInsts.insert(0); + DependingInsts.insert(nullptr); else // Add the predecessors to the worklist. 
do { diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h index 8044494..f71cf2b 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h @@ -27,10 +27,10 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" -#include "llvm/Support/CallSite.h" -#include "llvm/Support/InstIterator.h" #include "llvm/Transforms/ObjCARC.h" #include "llvm/Transforms/Utils/Local.h" @@ -308,6 +308,7 @@ static inline bool IsPotentialRetainableObjPtr(const Value *Op) { // Special arguments can not be a valid retainable object pointer. if (const Argument *Arg = dyn_cast<Argument>(Op)) if (Arg->hasByValAttr() || + Arg->hasInAllocaAttr() || Arg->hasNestAttr() || Arg->hasStructRetAttr()) return false; diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp index 00d9864..1a25391 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp @@ -24,7 +24,6 @@ /// //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "objc-arc-ap-elim" #include "ObjCARC.h" #include "llvm/ADT/STLExtras.h" #include "llvm/IR/Constants.h" @@ -34,11 +33,13 @@ using namespace llvm; using namespace llvm::objcarc; +#define DEBUG_TYPE "objc-arc-ap-elim" + namespace { /// \brief Autorelease pool elimination. class ObjCARCAPElim : public ModulePass { - virtual void getAnalysisUsage(AnalysisUsage &AU) const; - virtual bool runOnModule(Module &M); + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnModule(Module &M) override; static bool MayAutorelease(ImmutableCallSite CS, unsigned Depth = 0); static bool OptimizeBB(BasicBlock *BB); @@ -93,7 +94,7 @@ bool ObjCARCAPElim::MayAutorelease(ImmutableCallSite CS, unsigned Depth) { bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) { bool Changed = false; - Instruction *Push = 0; + Instruction *Push = nullptr; for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { Instruction *Inst = I++; switch (GetBasicInstructionClass(Inst)) { @@ -112,11 +113,11 @@ bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) { Inst->eraseFromParent(); Push->eraseFromParent(); } - Push = 0; + Push = nullptr; break; case IC_CallOrUser: if (MayAutorelease(ImmutableCallSite(Inst))) - Push = 0; + Push = nullptr; break; default: break; @@ -154,8 +155,8 @@ bool ObjCARCAPElim::runOnModule(Module &M) { for (User::op_iterator OI = Init->op_begin(), OE = Init->op_end(); OI != OE; ++OI) { Value *Op = *OI; - // llvm.global_ctors is an array of pairs where the second members - // are constructor functions. + // llvm.global_ctors is an array of three-field structs where the second + // members are constructor functions. Function *F = dyn_cast<Function>(cast<ConstantStruct>(Op)->getOperand(1)); // If the user used a constructor function with the wrong signature and // it got bitcasted or whatever, look the other way. @@ -165,7 +166,7 @@ bool ObjCARCAPElim::runOnModule(Module &M) { if (F->isDeclaration()) continue; // Only look at functions with one basic block. - if (llvm::next(F->begin()) != F->end()) + if (std::next(F->begin()) != F->end()) continue; // Ok, a single-block constructor function definition. Try to optimize it. 
Changed |= OptimizeBB(F->begin()); diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp index d18667b..2c09e70 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp @@ -20,7 +20,6 @@ /// //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "objc-arc-aa" #include "ObjCARC.h" #include "ObjCARCAliasAnalysis.h" #include "llvm/IR/Instruction.h" @@ -28,6 +27,8 @@ #include "llvm/PassAnalysisSupport.h" #include "llvm/PassSupport.h" +#define DEBUG_TYPE "objc-arc-aa" + namespace llvm { class Function; class Value; diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h index 41ccfe2..97b565b 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h @@ -44,28 +44,28 @@ namespace objcarc { } private: - virtual void initializePass() { + void initializePass() override { InitializeAliasAnalysis(this); } /// This method is used when a pass implements an analysis interface through /// multiple inheritance. If needed, it should override this to adjust the /// this pointer as needed for the specified pass info. - virtual void *getAdjustedAnalysisPointer(const void *PI) { + void *getAdjustedAnalysisPointer(const void *PI) override { if (PI == &AliasAnalysis::ID) return static_cast<AliasAnalysis *>(this); return this; } - virtual void getAnalysisUsage(AnalysisUsage &AU) const; - virtual AliasResult alias(const Location &LocA, const Location &LocB); - virtual bool pointsToConstantMemory(const Location &Loc, bool OrLocal); - virtual ModRefBehavior getModRefBehavior(ImmutableCallSite CS); - virtual ModRefBehavior getModRefBehavior(const Function *F); - virtual ModRefResult getModRefInfo(ImmutableCallSite CS, - const Location &Loc); - virtual ModRefResult getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2); + void getAnalysisUsage(AnalysisUsage &AU) const override; + AliasResult alias(const Location &LocA, const Location &LocB) override; + bool pointsToConstantMemory(const Location &Loc, bool OrLocal) override; + ModRefBehavior getModRefBehavior(ImmutableCallSite CS) override; + ModRefBehavior getModRefBehavior(const Function *F) override; + ModRefResult getModRefInfo(ImmutableCallSite CS, + const Location &Loc) override; + ModRefResult getModRefInfo(ImmutableCallSite CS1, + ImmutableCallSite CS2) override; }; } // namespace objcarc diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp index 9d80037..f48d53d 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -26,13 +26,12 @@ // TODO: ObjCARCContract could insert PHI nodes when uses aren't // dominated by single calls. 
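[Editor's note] One change repeats through every pass touched by this import: implementations of base-class virtuals drop 'virtual' and gain C++11 'override', so a future signature change in the base class becomes a compile error instead of a silently dead override. A generic illustration, not tied to the LLVM headers:

struct Pass {
  virtual ~Pass() {}
  virtual bool runOnFunction(int &F) { return false; }
};

struct MyPass : Pass {
  // If Pass::runOnFunction is ever renamed or retyped, this line stops
  // compiling instead of quietly becoming an unrelated new virtual.
  bool runOnFunction(int &F) override { return ++F > 0; }
};

int main() {
  int F = 0;
  MyPass P;
  return P.runOnFunction(F) ? 0 : 1;
}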
-#define DEBUG_TYPE "objc-arc-contract" #include "ObjCARC.h" #include "ARCRuntimeEntryPoints.h" #include "DependencyAnalysis.h" #include "ProvenanceAnalysis.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Dominators.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Operator.h" #include "llvm/Support/Debug.h" @@ -40,6 +39,8 @@ using namespace llvm; using namespace llvm::objcarc; +#define DEBUG_TYPE "objc-arc-contract" + STATISTIC(NumPeeps, "Number of calls peephole-optimized"); STATISTIC(NumStoreStrongs, "Number objc_storeStrong calls formed"); @@ -79,9 +80,9 @@ namespace { void ContractRelease(Instruction *Release, inst_iterator &Iter); - virtual void getAnalysisUsage(AnalysisUsage &AU) const; - virtual bool doInitialization(Module &M); - virtual bool runOnFunction(Function &F); + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; public: static char ID; @@ -95,7 +96,7 @@ char ObjCARCContract::ID = 0; INITIALIZE_PASS_BEGIN(ObjCARCContract, "objc-arc-contract", "ObjC ARC contraction", false, false) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(ObjCARCContract, "objc-arc-contract", "ObjC ARC contraction", false, false) @@ -105,7 +106,7 @@ Pass *llvm::createObjCARCContractPass() { void ObjCARCContract::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<AliasAnalysis>(); - AU.addRequired<DominatorTree>(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); } @@ -157,7 +158,7 @@ ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease, // Check that there are no instructions between the retain and the autorelease // (such as an autorelease_pop) which may change the count. - CallInst *Retain = 0; + CallInst *Retain = nullptr; if (Class == IC_AutoreleaseRV) FindDependencies(RetainAutoreleaseRVDep, Arg, Autorelease->getParent(), Autorelease, @@ -218,7 +219,7 @@ void ObjCARCContract::ContractRelease(Instruction *Release, BasicBlock::iterator I = Load, End = BB->end(); ++I; AliasAnalysis::Location Loc = AA->getLocation(Load); - StoreInst *Store = 0; + StoreInst *Store = nullptr; bool SawRelease = false; for (; !Store || !SawRelease; ++I) { if (I == End) @@ -300,7 +301,7 @@ bool ObjCARCContract::doInitialization(Module &M) { EP.Initialize(&M); // Initialize RetainRVMarker. - RetainRVMarker = 0; + RetainRVMarker = nullptr; if (NamedMDNode *NMD = M.getNamedMetadata("clang.arc.retainAutoreleasedReturnValueMarker")) if (NMD->getNumOperands() == 1) { @@ -323,7 +324,7 @@ bool ObjCARCContract::runOnFunction(Function &F) { Changed = false; AA = &getAnalysis<AliasAnalysis>(); - DT = &getAnalysis<DominatorTree>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); PA.setAA(&getAnalysis<AliasAnalysis>()); @@ -440,17 +441,17 @@ bool ObjCARCContract::runOnFunction(Function &F) { // Don't use GetObjCArg because we don't want to look through bitcasts // and such; to do the replacement, the argument must have type i8*. - const Value *Arg = cast<CallInst>(Inst)->getArgOperand(0); + Value *Arg = cast<CallInst>(Inst)->getArgOperand(0); for (;;) { // If we're compiling bugpointed code, don't get in trouble. if (!isa<Instruction>(Arg) && !isa<Argument>(Arg)) break; // Look through the uses of the pointer. 
- for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end(); + for (Value::use_iterator UI = Arg->use_begin(), UE = Arg->use_end(); UI != UE; ) { - Use &U = UI.getUse(); - unsigned OperandNo = UI.getOperandNo(); - ++UI; // Increment UI now, because we may unlink its element. + // Increment UI now, because we may unlink its element. + Use &U = *UI++; + unsigned OperandNo = U.getOperandNo(); // If the call's return value dominates a use of the call's argument // value, rewrite the use to use the return value. We check for @@ -475,9 +476,9 @@ bool ObjCARCContract::runOnFunction(Function &F) { for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) if (PHI->getIncomingBlock(i) == BB) { // Keep the UI iterator valid. - if (&PHI->getOperandUse( - PHINode::getOperandNumForIncomingValue(i)) == - &UI.getUse()) + if (UI != UE && + &PHI->getOperandUse( + PHINode::getOperandNumForIncomingValue(i)) == &*UI) ++UI; PHI->setIncomingValue(i, Replacement); } diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp index 39bf8f3..bf9fcbb 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp @@ -23,11 +23,10 @@ /// //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "objc-arc-expand" - #include "ObjCARC.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/Function.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Value.h" @@ -37,9 +36,10 @@ #include "llvm/PassSupport.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/InstIterator.h" #include "llvm/Support/raw_ostream.h" +#define DEBUG_TYPE "objc-arc-expand" + namespace llvm { class Module; } @@ -50,9 +50,9 @@ using namespace llvm::objcarc; namespace { /// \brief Early ARC transformations. class ObjCARCExpand : public FunctionPass { - virtual void getAnalysisUsage(AnalysisUsage &AU) const; - virtual bool doInitialization(Module &M); - virtual bool runOnFunction(Function &F); + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; /// A flag indicating whether this optimization pass should run. bool Run; diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index 2976df6..dd4dd50 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -24,7 +24,6 @@ /// //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "objc-arc-opts" #include "ObjCARC.h" #include "ARCRuntimeEntryPoints.h" #include "DependencyAnalysis.h" @@ -35,15 +34,17 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; using namespace llvm::objcarc; +#define DEBUG_TYPE "objc-arc-opts" + /// \defgroup MiscUtils Miscellaneous utilities that are not ARC specific. 
/// @{ @@ -156,24 +157,21 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) { return FindSingleUseIdentifiedObject( cast<CallInst>(Arg)->getArgOperand(0)); if (!IsObjCIdentifiedObject(Arg)) - return 0; + return nullptr; return Arg; } // If we found an identifiable object but it has multiple uses, but they are // trivial uses, we can still consider this to be a single-use value. if (IsObjCIdentifiedObject(Arg)) { - for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end(); - UI != UE; ++UI) { - const User *U = *UI; + for (const User *U : Arg->users()) if (!U->use_empty() || StripPointerCastsAndObjCCalls(U) != Arg) - return 0; - } + return nullptr; return Arg; } - return 0; + return nullptr; } /// This is a wrapper around getUnderlyingObjCPtr along the lines of @@ -376,13 +374,13 @@ namespace { bool CFGHazardAfflicted; RRInfo() : - KnownSafe(false), IsTailCallRelease(false), ReleaseMetadata(0), + KnownSafe(false), IsTailCallRelease(false), ReleaseMetadata(nullptr), CFGHazardAfflicted(false) {} void clear(); /// Conservatively merge the two RRInfo. Returns true if a partial merge has - /// occured, false otherwise. + /// occurred, false otherwise. bool Merge(const RRInfo &Other); }; @@ -391,7 +389,7 @@ namespace { void RRInfo::clear() { KnownSafe = false; IsTailCallRelease = false; - ReleaseMetadata = 0; + ReleaseMetadata = nullptr; Calls.clear(); ReverseInsertPts.clear(); CFGHazardAfflicted = false; @@ -400,7 +398,7 @@ void RRInfo::clear() { bool RRInfo::Merge(const RRInfo &Other) { // Conservatively merge the ReleaseMetadata information. if (ReleaseMetadata != Other.ReleaseMetadata) - ReleaseMetadata = 0; + ReleaseMetadata = nullptr; // Conservatively merge the boolean state. KnownSafe &= Other.KnownSafe; @@ -459,7 +457,7 @@ namespace { } bool IsTrackingImpreciseReleases() const { - return RRI.ReleaseMetadata != 0; + return RRI.ReleaseMetadata != nullptr; } const MDNode *GetReleaseMetadata() const { @@ -538,8 +536,7 @@ namespace { void PtrState::Merge(const PtrState &Other, bool TopDown) { - Seq = MergeSeqs(static_cast<Sequence>(Seq), static_cast<Sequence>(Other.Seq), - TopDown); + Seq = MergeSeqs(GetSeq(), Other.GetSeq(), TopDown); KnownPositiveRefCount &= Other.KnownPositiveRefCount; // If we're not in a sequence (anymore), drop all associated state. @@ -660,7 +657,7 @@ namespace { /// which pass through this block. This is only valid after both the /// top-down and bottom-up traversals are complete. /// - /// Returns true if overflow occured. Returns false if overflow did not + /// Returns true if overflow occurred. Returns false if overflow did not /// occur. bool GetAllPathCountWithOverflow(unsigned &PathCount) const { if (TopDownPathCount == OverflowOccurredValue || @@ -668,7 +665,7 @@ namespace { return true; unsigned long long Product = (unsigned long long)TopDownPathCount*BottomUpPathCount; - // Overflow occured if any of the upper bits of Product are set or if all + // Overflow occurred if any of the upper bits of Product are set or if all // the lower bits of Product are all set. return (Product >> 32) || ((PathCount = Product) == OverflowOccurredValue); @@ -712,7 +709,7 @@ void BBState::MergePred(const BBState &Other) { // In order to be consistent, we clear the top down pointers when by adding // TopDownPathCount becomes OverflowOccurredValue even though "true" overflow - // has not occured. + // has not occurred. 
+ has not occurred.
if (TopDownPathCount == OverflowOccurredValue) { clearTopDownPointers(); return; @@ -756,7 +753,7 @@ void BBState::MergeSucc(const BBState &Other) { // In order to be consistent, we clear the top down pointers when by adding // BottomUpPathCount becomes OverflowOccurredValue even though "true" overflow - // has not occured. + // has not occurred. if (BottomUpPathCount == OverflowOccurredValue) { clearBottomUpPointers(); return; @@ -822,7 +819,7 @@ ARCAnnotationTargetIdentifier("objc-arc-annotation-target-identifier", /// arc annotation processor tool. If the function is an static MDString *AppendMDNodeToSourcePtr(unsigned NodeId, Value *Ptr) { - MDString *Hash = 0; + MDString *Hash = nullptr; // If pointer is a result of an instruction and it does not have a source // MDNode it, attach a new MDNode onto it. If pointer is a result of @@ -884,7 +881,7 @@ static void AppendMDNodeToInstForPtr(unsigned NodeId, MDString *PtrSourceMDNodeID, Sequence OldSeq, Sequence NewSeq) { - MDNode *Node = 0; + MDNode *Node = nullptr; Value *tmp[3] = {PtrSourceMDNodeID, SequenceToMDString(Inst->getContext(), OldSeq), @@ -920,7 +917,7 @@ static void GenerateARCBBEntranceAnnotation(const char *Name, BasicBlock *BB, Value *PtrName; StringRef Tmp = Ptr->getName(); - if (0 == (PtrName = M->getGlobalVariable(Tmp, true))) { + if (nullptr == (PtrName = M->getGlobalVariable(Tmp, true))) { Value *ActualPtrName = Builder.CreateGlobalStringPtr(Tmp, Tmp + "_STR"); PtrName = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage, @@ -929,7 +926,7 @@ static void GenerateARCBBEntranceAnnotation(const char *Name, BasicBlock *BB, Value *S; std::string SeqStr = SequenceToString(Seq); - if (0 == (S = M->getGlobalVariable(SeqStr, true))) { + if (nullptr == (S = M->getGlobalVariable(SeqStr, true))) { Value *ActualPtrName = Builder.CreateGlobalStringPtr(SeqStr, SeqStr + "_STR"); S = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage, @@ -959,11 +956,11 @@ static void GenerateARCBBTerminatorAnnotation(const char *Name, BasicBlock *BB, /*isVarArg=*/false); Constant *Callee = M->getOrInsertFunction(Name, FTy); - IRBuilder<> Builder(BB, llvm::prior(BB->end())); + IRBuilder<> Builder(BB, std::prev(BB->end())); Value *PtrName; StringRef Tmp = Ptr->getName(); - if (0 == (PtrName = M->getGlobalVariable(Tmp, true))) { + if (nullptr == (PtrName = M->getGlobalVariable(Tmp, true))) { Value *ActualPtrName = Builder.CreateGlobalStringPtr(Tmp, Tmp + "_STR"); PtrName = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage, @@ -972,7 +969,7 @@ static void GenerateARCBBTerminatorAnnotation(const char *Name, BasicBlock *BB, Value *S; std::string SeqStr = SequenceToString(Seq); - if (0 == (S = M->getGlobalVariable(SeqStr, true))) { + if (nullptr == (S = M->getGlobalVariable(SeqStr, true))) { Value *ActualPtrName = Builder.CreateGlobalStringPtr(SeqStr, SeqStr + "_STR"); S = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage, @@ -1006,7 +1003,7 @@ static void GenerateARCAnnotation(unsigned InstMDId, // llvm-arc-annotation-processor tool to cross reference where the source // pointer is in the LLVM IR since the LLVM IR parser does not submit such // information via debug info for backends to use (since why would anyone - // need such a thing from LLVM IR besides in non standard cases + // need such a thing from LLVM IR besides in non-standard cases // [i.e. this]). 
MDString *SourcePtrMDNode = AppendMDNodeToSourcePtr(PtrMDId, Ptr); @@ -1164,10 +1161,10 @@ namespace { void GatherStatistics(Function &F, bool AfterOptimization = false); #endif - virtual void getAnalysisUsage(AnalysisUsage &AU) const; - virtual bool doInitialization(Module &M); - virtual bool runOnFunction(Function &F); - virtual void releaseMemory(); + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + void releaseMemory() override; public: static char ID; @@ -1267,13 +1264,11 @@ ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV, Users.push_back(Ptr); do { Ptr = Users.pop_back_val(); - for (Value::const_use_iterator UI = Ptr->use_begin(), UE = Ptr->use_end(); - UI != UE; ++UI) { - const User *I = *UI; - if (isa<ReturnInst>(I) || GetBasicInstructionClass(I) == IC_RetainRV) + for (const User *U : Ptr->users()) { + if (isa<ReturnInst>(U) || GetBasicInstructionClass(U) == IC_RetainRV) return; - if (isa<BitCastInst>(I)) - Users.push_back(I); + if (isa<BitCastInst>(U)) + Users.push_back(U); } } while (!Users.empty()); @@ -1724,7 +1719,7 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, BBState &MyStates) { bool NestingDetected = false; InstructionClass Class = GetInstructionClass(Inst); - const Value *Arg = 0; + const Value *Arg = nullptr; DEBUG(dbgs() << "Class: " << Class << "\n"); @@ -1809,13 +1804,13 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, // pointer has multiple owners implying that we must be more conservative. // // This comes up in the context of a pointer being ``KnownSafe''. In the - // presense of a block being initialized, the frontend will emit the + // presence of a block being initialized, the frontend will emit the // objc_retain on the original pointer and the release on the pointer loaded // from the alloca. The optimizer will through the provenance analysis // realize that the two are related, but since we only require KnownSafe in // one direction, will match the inner retain on the original pointer with // the guard release on the original pointer. This is fixed by ensuring that - // in the presense of allocas we only unconditionally remove pointers if + // in the presence of allocas we only unconditionally remove pointers if // both our retain and our release are KnownSafe. if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { if (AreAnyUnderlyingObjectsAnAlloca(SI->getPointerOperand())) { @@ -1875,7 +1870,7 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, if (isa<InvokeInst>(Inst)) S.InsertReverseInsertPt(BB->getFirstInsertionPt()); else - S.InsertReverseInsertPt(llvm::next(BasicBlock::iterator(Inst))); + S.InsertReverseInsertPt(std::next(BasicBlock::iterator(Inst))); S.SetSeq(S_Use); ANNOTATE_BOTTOMUP(Inst, Ptr, Seq, S_Use); } else if (Seq == S_Release && IsUser(Class)) { @@ -1889,7 +1884,7 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, if (isa<InvokeInst>(Inst)) S.InsertReverseInsertPt(BB->getFirstInsertionPt()); else - S.InsertReverseInsertPt(llvm::next(BasicBlock::iterator(Inst))); + S.InsertReverseInsertPt(std::next(BasicBlock::iterator(Inst))); } break; case S_Stop: @@ -1946,7 +1941,7 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, // Visit all the instructions, bottom-up. for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) { - Instruction *Inst = llvm::prior(I); + Instruction *Inst = std::prev(I); // Invoke instructions are visited as part of their successors (below). 
if (isa<InvokeInst>(Inst)) @@ -1980,7 +1975,7 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, BBState &MyStates) { bool NestingDetected = false; InstructionClass Class = GetInstructionClass(Inst); - const Value *Arg = 0; + const Value *Arg = nullptr; switch (Class) { case IC_RetainBlock: @@ -2032,7 +2027,7 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, switch (OldSeq) { case S_Retain: case S_CanRelease: - if (OldSeq == S_Retain || ReleaseMetadata != 0) + if (OldSeq == S_Retain || ReleaseMetadata != nullptr) S.ClearReverseInsertPts(); // FALL THROUGH case S_Use: @@ -2438,7 +2433,7 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> } else { if (ReleasesToMove.ReleaseMetadata != NewRetainReleaseRRI.ReleaseMetadata) - ReleasesToMove.ReleaseMetadata = 0; + ReleasesToMove.ReleaseMetadata = nullptr; if (ReleasesToMove.IsTailCallRelease != NewRetainReleaseRRI.IsTailCallRelease) ReleasesToMove.IsTailCallRelease = false; @@ -2692,12 +2687,12 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { // within the same block. Theoretically, we could do memdep-style non-local // analysis too, but that would want caching. A better approach would be to // use the technique that EarlyCSE uses. - inst_iterator Current = llvm::prior(I); + inst_iterator Current = std::prev(I); BasicBlock *CurrentBB = Current.getBasicBlockIterator(); for (BasicBlock::iterator B = CurrentBB->begin(), J = Current.getInstructionIterator(); J != B; --J) { - Instruction *EarlierInst = &*llvm::prior(J); + Instruction *EarlierInst = &*std::prev(J); InstructionClass EarlierClass = GetInstructionClass(EarlierInst); switch (EarlierClass) { case IC_LoadWeak: @@ -2788,9 +2783,8 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { CallInst *Call = cast<CallInst>(Inst); Value *Arg = Call->getArgOperand(0); if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Arg)) { - for (Value::use_iterator UI = Alloca->use_begin(), - UE = Alloca->use_end(); UI != UE; ++UI) { - const Instruction *UserInst = cast<Instruction>(*UI); + for (User *U : Alloca->users()) { + const Instruction *UserInst = cast<Instruction>(U); switch (GetBasicInstructionClass(UserInst)) { case IC_InitWeak: case IC_StoreWeak: @@ -2801,8 +2795,7 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { } } Changed = true; - for (Value::use_iterator UI = Alloca->use_begin(), - UE = Alloca->use_end(); UI != UE; ) { + for (auto UI = Alloca->user_begin(), UE = Alloca->user_end(); UI != UE;) { CallInst *UserInst = cast<CallInst>(*UI++); switch (GetBasicInstructionClass(UserInst)) { case IC_InitWeak: @@ -2892,7 +2885,7 @@ FindPredecessorRetainWithSafePath(const Value *Arg, BasicBlock *BB, FindDependencies(CanChangeRetainCount, Arg, BB, Autorelease, DepInsts, Visited, PA); if (DepInsts.size() != 1) - return 0; + return nullptr; CallInst *Retain = dyn_cast_or_null<CallInst>(*DepInsts.begin()); @@ -2901,7 +2894,7 @@ FindPredecessorRetainWithSafePath(const Value *Arg, BasicBlock *BB, if (!Retain || !IsRetain(GetBasicInstructionClass(Retain)) || GetObjCArg(Retain) != Arg) { - return 0; + return nullptr; } return Retain; @@ -2919,17 +2912,17 @@ FindPredecessorAutoreleaseWithSafePath(const Value *Arg, BasicBlock *BB, FindDependencies(NeedsPositiveRetainCount, Arg, BB, Ret, DepInsts, V, PA); if (DepInsts.size() != 1) - return 0; + return nullptr; CallInst *Autorelease = dyn_cast_or_null<CallInst>(*DepInsts.begin()); if (!Autorelease) - return 0; + return nullptr; InstructionClass AutoreleaseClass = GetBasicInstructionClass(Autorelease); if 
(!IsAutorelease(AutoreleaseClass)) - return 0; + return nullptr; if (GetObjCArg(Autorelease) != Arg) - return 0; + return nullptr; return Autorelease; } diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp index ae3c628..22be6fd 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp @@ -79,11 +79,10 @@ static bool IsStoredObjCPointer(const Value *P) { Visited.insert(P); do { P = Worklist.pop_back_val(); - for (Value::const_use_iterator UI = P->use_begin(), UE = P->use_end(); - UI != UE; ++UI) { - const User *Ur = *UI; + for (const Use &U : P->uses()) { + const User *Ur = U.getUser(); if (isa<StoreInst>(Ur)) { - if (UI.getOperandNo() == 0) + if (U.getOperandNo() == 0) // The pointer is stored. return true; // The pointed is stored through. diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp index a3eb07a9..1a3a4aa 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -14,20 +14,21 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "adce" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Pass.h" -#include "llvm/Support/CFG.h" -#include "llvm/Support/InstIterator.h" using namespace llvm; +#define DEBUG_TYPE "adce" + STATISTIC(NumRemoved, "Number of instructions removed"); namespace { @@ -37,9 +38,9 @@ namespace { initializeADCEPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnFunction(Function& F); + bool runOnFunction(Function& F) override; - virtual void getAnalysisUsage(AnalysisUsage& AU) const { + void getAnalysisUsage(AnalysisUsage& AU) const override { AU.setPreservesCFG(); } @@ -50,6 +51,9 @@ char ADCE::ID = 0; INITIALIZE_PASS(ADCE, "adce", "Aggressive Dead Code Elimination", false, false) bool ADCE::runOnFunction(Function& F) { + if (skipOptnoneFunction(F)) + return false; + SmallPtrSet<Instruction*, 128> alive; SmallVector<Instruction*, 128> worklist; diff --git a/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp b/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp deleted file mode 100644 index 007e9b7..0000000 --- a/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ /dev/null @@ -1,2002 +0,0 @@ -//===- CodeGenPrepare.cpp - Prepare a function for code generation --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass munges the code in the input function to better prepare it for -// SelectionDAG-based code generation. This works around limitations in it's -// basic-block-at-a-time approach. It should eventually be removed. 
-// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "codegenprepare" -#include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/ValueMap.h" -#include "llvm/Analysis/DominatorInternals.h" -#include "llvm/Analysis/Dominators.h" -#include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Assembly/Writer.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InlineAsm.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/Pass.h" -#include "llvm/Support/CallSite.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/GetElementPtrTypeIterator.h" -#include "llvm/Support/PatternMatch.h" -#include "llvm/Support/ValueHandle.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/BuildLibCalls.h" -#include "llvm/Transforms/Utils/BypassSlowDivision.h" -#include "llvm/Transforms/Utils/Local.h" -using namespace llvm; -using namespace llvm::PatternMatch; - -STATISTIC(NumBlocksElim, "Number of blocks eliminated"); -STATISTIC(NumPHIsElim, "Number of trivial PHIs eliminated"); -STATISTIC(NumGEPsElim, "Number of GEPs converted to casts"); -STATISTIC(NumCmpUses, "Number of uses of Cmp expressions replaced with uses of " - "sunken Cmps"); -STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses " - "of sunken Casts"); -STATISTIC(NumMemoryInsts, "Number of memory instructions whose address " - "computations were sunk"); -STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads"); -STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized"); -STATISTIC(NumRetsDup, "Number of return instructions duplicated"); -STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); -STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); - -static cl::opt<bool> DisableBranchOpts( - "disable-cgp-branch-opts", cl::Hidden, cl::init(false), - cl::desc("Disable branch optimizations in CodeGenPrepare")); - -static cl::opt<bool> DisableSelectToBranch( - "disable-cgp-select2branch", cl::Hidden, cl::init(false), - cl::desc("Disable select to branch conversion.")); - -namespace { - class CodeGenPrepare : public FunctionPass { - /// TLI - Keep a pointer of a TargetLowering to consult for determining - /// transformation profitability. - const TargetMachine *TM; - const TargetLowering *TLI; - const TargetLibraryInfo *TLInfo; - DominatorTree *DT; - - /// CurInstIterator - As we scan instructions optimizing them, this is the - /// next instruction to optimize. Xforms that can invalidate this should - /// update it. - BasicBlock::iterator CurInstIterator; - - /// Keeps track of non-local addresses that have been sunk into a block. - /// This allows us to avoid inserting duplicate code for blocks with - /// multiple load/stores of the same address. - ValueMap<Value*, Value*> SunkAddrs; - - /// ModifiedDT - If CFG is modified in anyway, dominator tree may need to - /// be updated. - bool ModifiedDT; - - /// OptSize - True if optimizing for size. 
-    bool OptSize;
-
-  public:
-    static char ID; // Pass identification, replacement for typeid
-    explicit CodeGenPrepare(const TargetMachine *TM = 0)
-      : FunctionPass(ID), TM(TM), TLI(0) {
-        initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
-      }
-    bool runOnFunction(Function &F);
-
-    const char *getPassName() const { return "CodeGen Prepare"; }
-
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.addPreserved<DominatorTree>();
-      AU.addRequired<TargetLibraryInfo>();
-    }
-
-  private:
-    bool EliminateFallThrough(Function &F);
-    bool EliminateMostlyEmptyBlocks(Function &F);
-    bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
-    void EliminateMostlyEmptyBlock(BasicBlock *BB);
-    bool OptimizeBlock(BasicBlock &BB);
-    bool OptimizeInst(Instruction *I);
-    bool OptimizeMemoryInst(Instruction *I, Value *Addr, Type *AccessTy);
-    bool OptimizeInlineAsmInst(CallInst *CS);
-    bool OptimizeCallInst(CallInst *CI);
-    bool MoveExtToFormExtLoad(Instruction *I);
-    bool OptimizeExtUses(Instruction *I);
-    bool OptimizeSelectInst(SelectInst *SI);
-    bool DupRetToEnableTailCallOpts(BasicBlock *BB);
-    bool PlaceDbgValues(Function &F);
-  };
-}
-
-char CodeGenPrepare::ID = 0;
-INITIALIZE_PASS_BEGIN(CodeGenPrepare, "codegenprepare",
-                "Optimize for code generation", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
-INITIALIZE_PASS_END(CodeGenPrepare, "codegenprepare",
-                "Optimize for code generation", false, false)
-
-FunctionPass *llvm::createCodeGenPreparePass(const TargetMachine *TM) {
-  return new CodeGenPrepare(TM);
-}
-
-bool CodeGenPrepare::runOnFunction(Function &F) {
-  bool EverMadeChange = false;
-
-  ModifiedDT = false;
-  if (TM) TLI = TM->getTargetLowering();
-  TLInfo = &getAnalysis<TargetLibraryInfo>();
-  DT = getAnalysisIfAvailable<DominatorTree>();
-  OptSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                           Attribute::OptimizeForSize);
-
-  /// This optimization identifies DIV instructions that can be
-  /// profitably bypassed and carried out with a shorter, faster divide.
-  if (!OptSize && TLI && TLI->isSlowDivBypassed()) {
-    const DenseMap<unsigned int, unsigned int> &BypassWidths =
-       TLI->getBypassSlowDivWidths();
-    for (Function::iterator I = F.begin(); I != F.end(); I++)
-      EverMadeChange |= bypassSlowDivision(F, I, BypassWidths);
-  }
-
-  // Eliminate blocks that contain only PHI nodes and an
-  // unconditional branch.
-  EverMadeChange |= EliminateMostlyEmptyBlocks(F);
-
-  // If llvm.dbg.value is far away from the value, then iSel may not be able
-  // to handle it properly. iSel will drop llvm.dbg.value if it cannot find a
-  // node corresponding to the value.
-  EverMadeChange |= PlaceDbgValues(F);
-
-  bool MadeChange = true;
-  while (MadeChange) {
-    MadeChange = false;
-    for (Function::iterator I = F.begin(); I != F.end(); ) {
-      BasicBlock *BB = I++;
-      MadeChange |= OptimizeBlock(*BB);
-    }
-    EverMadeChange |= MadeChange;
-  }
-
-  SunkAddrs.clear();
-
-  if (!DisableBranchOpts) {
-    MadeChange = false;
-    SmallPtrSet<BasicBlock*, 8> WorkList;
-    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
-      SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB));
-      MadeChange |= ConstantFoldTerminator(BB, true);
-      if (!MadeChange) continue;
-
-      for (SmallVectorImpl<BasicBlock*>::iterator
-             II = Successors.begin(), IE = Successors.end(); II != IE; ++II)
-        if (pred_begin(*II) == pred_end(*II))
-          WorkList.insert(*II);
-    }
-
-    // Delete the dead blocks and any of their dead successors.
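// (Editorial aside, not part of the deleted file.) The worklist below is what
// makes the cleanup transitive: constant-folding a terminator in one block can
// leave a successor with no predecessors, and deleting that successor can in
// turn orphan *its* successors, so every deleted block's successors are pushed
// back and re-tested with pred_begin() == pred_end().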
- MadeChange |= !WorkList.empty(); - while (!WorkList.empty()) { - BasicBlock *BB = *WorkList.begin(); - WorkList.erase(BB); - SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB)); - - DeleteDeadBlock(BB); - - for (SmallVectorImpl<BasicBlock*>::iterator - II = Successors.begin(), IE = Successors.end(); II != IE; ++II) - if (pred_begin(*II) == pred_end(*II)) - WorkList.insert(*II); - } - - // Merge pairs of basic blocks with unconditional branches, connected by - // a single edge. - if (EverMadeChange || MadeChange) - MadeChange |= EliminateFallThrough(F); - - if (MadeChange) - ModifiedDT = true; - EverMadeChange |= MadeChange; - } - - if (ModifiedDT && DT) - DT->DT->recalculate(F); - - return EverMadeChange; -} - -/// EliminateFallThrough - Merge basic blocks which are connected -/// by a single edge, where one of the basic blocks has a single successor -/// pointing to the other basic block, which has a single predecessor. -bool CodeGenPrepare::EliminateFallThrough(Function &F) { - bool Changed = false; - // Scan all of the blocks in the function, except for the entry block. - for (Function::iterator I = ++F.begin(), E = F.end(); I != E; ) { - BasicBlock *BB = I++; - // If the destination block has a single pred, then this is a trivial - // edge, just collapse it. - BasicBlock *SinglePred = BB->getSinglePredecessor(); - - // Don't merge if BB's address is taken. - if (!SinglePred || SinglePred == BB || BB->hasAddressTaken()) continue; - - BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator()); - if (Term && !Term->isConditional()) { - Changed = true; - DEBUG(dbgs() << "To merge:\n"<< *SinglePred << "\n\n\n"); - // Remember if SinglePred was the entry block of the function. - // If so, we will need to move BB back to the entry position. - bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock(); - MergeBasicBlockIntoOnlyPred(BB, this); - - if (isEntry && BB != &BB->getParent()->getEntryBlock()) - BB->moveBefore(&BB->getParent()->getEntryBlock()); - - // We have erased a block. Update the iterator. - I = BB; - } - } - return Changed; -} - -/// EliminateMostlyEmptyBlocks - eliminate blocks that contain only PHI nodes, -/// debug info directives, and an unconditional branch. Passes before isel -/// (e.g. LSR/loopsimplify) often split edges in ways that are non-optimal for -/// isel. Start by eliminating these blocks so we can split them the way we -/// want them. -bool CodeGenPrepare::EliminateMostlyEmptyBlocks(Function &F) { - bool MadeChange = false; - // Note that this intentionally skips the entry block. - for (Function::iterator I = ++F.begin(), E = F.end(); I != E; ) { - BasicBlock *BB = I++; - - // If this block doesn't end with an uncond branch, ignore it. - BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()); - if (!BI || !BI->isUnconditional()) - continue; - - // If the instruction before the branch (skipping debug info) isn't a phi - // node, then other stuff is happening here. - BasicBlock::iterator BBI = BI; - if (BBI != BB->begin()) { - --BBI; - while (isa<DbgInfoIntrinsic>(BBI)) { - if (BBI == BB->begin()) - break; - --BBI; - } - if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI)) - continue; - } - - // Do not break infinite loops. 
-    BasicBlock *DestBB = BI->getSuccessor(0);
-    if (DestBB == BB)
-      continue;
-
-    if (!CanMergeBlocks(BB, DestBB))
-      continue;
-
-    EliminateMostlyEmptyBlock(BB);
-    MadeChange = true;
-  }
-  return MadeChange;
-}
-
-/// CanMergeBlocks - Return true if we can merge BB into DestBB if there is a
-/// single uncond branch between them, and BB contains no other non-phi
-/// instructions.
-bool CodeGenPrepare::CanMergeBlocks(const BasicBlock *BB,
-                                    const BasicBlock *DestBB) const {
-  // We only want to eliminate blocks whose phi nodes are used by phi nodes in
-  // the successor. If there are more complex conditions (e.g. preheaders),
-  // don't mess around with them.
-  BasicBlock::const_iterator BBI = BB->begin();
-  while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) {
-    for (Value::const_use_iterator UI = PN->use_begin(), E = PN->use_end();
-         UI != E; ++UI) {
-      const Instruction *User = cast<Instruction>(*UI);
-      if (User->getParent() != DestBB || !isa<PHINode>(User))
-        return false;
-      // If User is inside DestBB block and it is a PHINode then check
-      // incoming value. If incoming value is not from BB then this is
-      // a complex condition (e.g. preheaders) we want to avoid here.
-      if (User->getParent() == DestBB) {
-        if (const PHINode *UPN = dyn_cast<PHINode>(User))
-          for (unsigned I = 0, E = UPN->getNumIncomingValues(); I != E; ++I) {
-            Instruction *Insn = dyn_cast<Instruction>(UPN->getIncomingValue(I));
-            if (Insn && Insn->getParent() == BB &&
-                Insn->getParent() != UPN->getIncomingBlock(I))
-              return false;
-          }
-      }
-    }
-  }
-
-  // If BB and DestBB contain any common predecessors, then the phi nodes in BB
-  // and DestBB may have conflicting incoming values for the block. If so, we
-  // can't merge the block.
-  const PHINode *DestBBPN = dyn_cast<PHINode>(DestBB->begin());
-  if (!DestBBPN) return true;  // no conflict.
-
-  // Collect the preds of BB.
-  SmallPtrSet<const BasicBlock*, 16> BBPreds;
-  if (const PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
-    // It is faster to get preds from a PHI than with pred_iterator.
-    for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
-      BBPreds.insert(BBPN->getIncomingBlock(i));
-  } else {
-    BBPreds.insert(pred_begin(BB), pred_end(BB));
-  }
-
-  // Walk the preds of DestBB.
-  for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) {
-    BasicBlock *Pred = DestBBPN->getIncomingBlock(i);
-    if (BBPreds.count(Pred)) {   // Common predecessor?
-      BBI = DestBB->begin();
-      while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) {
-        const Value *V1 = PN->getIncomingValueForBlock(Pred);
-        const Value *V2 = PN->getIncomingValueForBlock(BB);
-
-        // If V2 is a phi node in BB, look up what the mapped value will be.
-        if (const PHINode *V2PN = dyn_cast<PHINode>(V2))
-          if (V2PN->getParent() == BB)
-            V2 = V2PN->getIncomingValueForBlock(Pred);
-
-        // If there is a conflict, bail out.
-        if (V1 != V2) return false;
-      }
-    }
-  }
-
-  return true;
-}
-
-
-/// EliminateMostlyEmptyBlock - Eliminate a basic block that has only PHIs and
-/// an unconditional branch in it.
-void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) {
-  BranchInst *BI = cast<BranchInst>(BB->getTerminator());
-  BasicBlock *DestBB = BI->getSuccessor(0);
-
-  DEBUG(dbgs() << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n" << *BB << *DestBB);
-
-  // If the destination block has a single pred, then this is a trivial edge,
-  // just collapse it.
- if (BasicBlock *SinglePred = DestBB->getSinglePredecessor()) { - if (SinglePred != DestBB) { - // Remember if SinglePred was the entry block of the function. If so, we - // will need to move BB back to the entry position. - bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock(); - MergeBasicBlockIntoOnlyPred(DestBB, this); - - if (isEntry && BB != &BB->getParent()->getEntryBlock()) - BB->moveBefore(&BB->getParent()->getEntryBlock()); - - DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n"); - return; - } - } - - // Otherwise, we have multiple predecessors of BB. Update the PHIs in DestBB - // to handle the new incoming edges it is about to have. - PHINode *PN; - for (BasicBlock::iterator BBI = DestBB->begin(); - (PN = dyn_cast<PHINode>(BBI)); ++BBI) { - // Remove the incoming value for BB, and remember it. - Value *InVal = PN->removeIncomingValue(BB, false); - - // Two options: either the InVal is a phi node defined in BB or it is some - // value that dominates BB. - PHINode *InValPhi = dyn_cast<PHINode>(InVal); - if (InValPhi && InValPhi->getParent() == BB) { - // Add all of the input values of the input PHI as inputs of this phi. - for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i) - PN->addIncoming(InValPhi->getIncomingValue(i), - InValPhi->getIncomingBlock(i)); - } else { - // Otherwise, add one instance of the dominating value for each edge that - // we will be adding. - if (PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) { - for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i) - PN->addIncoming(InVal, BBPN->getIncomingBlock(i)); - } else { - for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) - PN->addIncoming(InVal, *PI); - } - } - } - - // The PHIs are now updated, change everything that refers to BB to use - // DestBB and remove BB. - BB->replaceAllUsesWith(DestBB); - if (DT && !ModifiedDT) { - BasicBlock *BBIDom = DT->getNode(BB)->getIDom()->getBlock(); - BasicBlock *DestBBIDom = DT->getNode(DestBB)->getIDom()->getBlock(); - BasicBlock *NewIDom = DT->findNearestCommonDominator(BBIDom, DestBBIDom); - DT->changeImmediateDominator(DestBB, NewIDom); - DT->eraseNode(BB); - } - BB->eraseFromParent(); - ++NumBlocksElim; - - DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n"); -} - -/// OptimizeNoopCopyExpression - If the specified cast instruction is a noop -/// copy (e.g. it's casting from one pointer type to another, i32->i8 on PPC), -/// sink it into user blocks to reduce the number of virtual -/// registers that must be created and coalesced. -/// -/// Return true if any changes are made. -/// -static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){ - // If this is a noop copy, - EVT SrcVT = TLI.getValueType(CI->getOperand(0)->getType()); - EVT DstVT = TLI.getValueType(CI->getType()); - - // This is an fp<->int conversion? - if (SrcVT.isInteger() != DstVT.isInteger()) - return false; - - // If this is an extension, it will be a zero or sign extension, which - // isn't a noop. - if (SrcVT.bitsLT(DstVT)) return false; - - // If these values will be promoted, find out what they will be promoted - // to. This helps us consider truncates on PPC as noop copies when they - // are. 
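// (Editorial illustration, not from the original file.) Concretely: on a
// target whose legalizer promotes both i8 and i16 to i32, the cast
//     %t = trunc i16 %x to i8
// becomes a same-register copy once both types are promoted, so the
// promotion-aware comparison that follows treats it as a noop copy and the
// cast can be sunk into its user blocks for free.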
- if (TLI.getTypeAction(CI->getContext(), SrcVT) == - TargetLowering::TypePromoteInteger) - SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT); - if (TLI.getTypeAction(CI->getContext(), DstVT) == - TargetLowering::TypePromoteInteger) - DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT); - - // If, after promotion, these are the same types, this is a noop copy. - if (SrcVT != DstVT) - return false; - - BasicBlock *DefBB = CI->getParent(); - - /// InsertedCasts - Only insert a cast in each block once. - DenseMap<BasicBlock*, CastInst*> InsertedCasts; - - bool MadeChange = false; - for (Value::use_iterator UI = CI->use_begin(), E = CI->use_end(); - UI != E; ) { - Use &TheUse = UI.getUse(); - Instruction *User = cast<Instruction>(*UI); - - // Figure out which BB this cast is used in. For PHI's this is the - // appropriate predecessor block. - BasicBlock *UserBB = User->getParent(); - if (PHINode *PN = dyn_cast<PHINode>(User)) { - UserBB = PN->getIncomingBlock(UI); - } - - // Preincrement use iterator so we don't invalidate it. - ++UI; - - // If this user is in the same block as the cast, don't change the cast. - if (UserBB == DefBB) continue; - - // If we have already inserted a cast into this block, use it. - CastInst *&InsertedCast = InsertedCasts[UserBB]; - - if (!InsertedCast) { - BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); - InsertedCast = - CastInst::Create(CI->getOpcode(), CI->getOperand(0), CI->getType(), "", - InsertPt); - MadeChange = true; - } - - // Replace a use of the cast with a use of the new cast. - TheUse = InsertedCast; - ++NumCastUses; - } - - // If we removed all uses, nuke the cast. - if (CI->use_empty()) { - CI->eraseFromParent(); - MadeChange = true; - } - - return MadeChange; -} - -/// OptimizeCmpExpression - sink the given CmpInst into user blocks to reduce -/// the number of virtual registers that must be created and coalesced. This is -/// a clear win except on targets with multiple condition code registers -/// (PowerPC), where it might lose; some adjustment may be wanted there. -/// -/// Return true if any changes are made. -static bool OptimizeCmpExpression(CmpInst *CI) { - BasicBlock *DefBB = CI->getParent(); - - /// InsertedCmp - Only insert a cmp in each block once. - DenseMap<BasicBlock*, CmpInst*> InsertedCmps; - - bool MadeChange = false; - for (Value::use_iterator UI = CI->use_begin(), E = CI->use_end(); - UI != E; ) { - Use &TheUse = UI.getUse(); - Instruction *User = cast<Instruction>(*UI); - - // Preincrement use iterator so we don't invalidate it. - ++UI; - - // Don't bother for PHI nodes. - if (isa<PHINode>(User)) - continue; - - // Figure out which BB this cmp is used in. - BasicBlock *UserBB = User->getParent(); - - // If this user is in the same block as the cmp, don't change the cmp. - if (UserBB == DefBB) continue; - - // If we have already inserted a cmp into this block, use it. - CmpInst *&InsertedCmp = InsertedCmps[UserBB]; - - if (!InsertedCmp) { - BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); - InsertedCmp = - CmpInst::Create(CI->getOpcode(), - CI->getPredicate(), CI->getOperand(0), - CI->getOperand(1), "", InsertPt); - MadeChange = true; - } - - // Replace a use of the cmp with a use of the new cmp. - TheUse = InsertedCmp; - ++NumCmpUses; - } - - // If we removed all uses, nuke the cmp. 
-  if (CI->use_empty())
-    CI->eraseFromParent();
-
-  return MadeChange;
-}
-
-namespace {
-class CodeGenPrepareFortifiedLibCalls : public SimplifyFortifiedLibCalls {
-protected:
-  void replaceCall(Value *With) {
-    CI->replaceAllUsesWith(With);
-    CI->eraseFromParent();
-  }
-  bool isFoldable(unsigned SizeCIOp, unsigned, bool) const {
-    if (ConstantInt *SizeCI =
-                             dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp)))
-      return SizeCI->isAllOnesValue();
-    return false;
-  }
-};
-} // end anonymous namespace
-
-bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
-  BasicBlock *BB = CI->getParent();
-
-  // Lower inline assembly if we can.
-  // If we found an inline asm expression, and if the target knows how to
-  // lower it to normal LLVM code, do so now.
-  if (TLI && isa<InlineAsm>(CI->getCalledValue())) {
-    if (TLI->ExpandInlineAsm(CI)) {
-      // Avoid invalidating the iterator.
-      CurInstIterator = BB->begin();
-      // Avoid processing instructions out of order, which could cause
-      // reuse before a value is defined.
-      SunkAddrs.clear();
-      return true;
-    }
-    // Sink address computing for memory operands into the block.
-    if (OptimizeInlineAsmInst(CI))
-      return true;
-  }
-
-  // Lower all uses of llvm.objectsize.*
-  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
-  if (II && II->getIntrinsicID() == Intrinsic::objectsize) {
-    bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1);
-    Type *ReturnTy = CI->getType();
-    Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL);
-
-    // Substituting this can cause recursive simplifications, which can
-    // invalidate our iterator. Use a WeakVH to hold onto it in case this
-    // happens.
-    WeakVH IterHandle(CurInstIterator);
-
-    replaceAndRecursivelySimplify(CI, RetVal, TLI ? TLI->getDataLayout() : 0,
-                                  TLInfo, ModifiedDT ? 0 : DT);
-
-    // If the iterator instruction was recursively deleted, start over at the
-    // start of the block.
-    if (IterHandle != CurInstIterator) {
-      CurInstIterator = BB->begin();
-      SunkAddrs.clear();
-    }
-    return true;
-  }
-
-  if (II && TLI) {
-    SmallVector<Value*, 2> PtrOps;
-    Type *AccessTy;
-    if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy))
-      while (!PtrOps.empty())
-        if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy))
-          return true;
-  }
-
-  // From here on out we're working with named functions.
-  if (CI->getCalledFunction() == 0) return false;
-
-  // We'll need DataLayout from here on out.
-  const DataLayout *TD = TLI ? TLI->getDataLayout() : 0;
-  if (!TD) return false;
-
-  // Lower all default uses of _chk calls. This is very similar
-  // to what InstCombineCalls does, but here we are only lowering calls
-  // that have the default "don't know" as the objectsize. Anything else
-  // should be left alone.
-  CodeGenPrepareFortifiedLibCalls Simplifier;
-  return Simplifier.fold(CI, TD, TLInfo);
-}
-
-/// DupRetToEnableTailCallOpts - Look for opportunities to duplicate return
-/// instructions to the predecessor to enable tail call optimizations.
The -/// case it is currently looking for is: -/// @code -/// bb0: -/// %tmp0 = tail call i32 @f0() -/// br label %return -/// bb1: -/// %tmp1 = tail call i32 @f1() -/// br label %return -/// bb2: -/// %tmp2 = tail call i32 @f2() -/// br label %return -/// return: -/// %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ] -/// ret i32 %retval -/// @endcode -/// -/// => -/// -/// @code -/// bb0: -/// %tmp0 = tail call i32 @f0() -/// ret i32 %tmp0 -/// bb1: -/// %tmp1 = tail call i32 @f1() -/// ret i32 %tmp1 -/// bb2: -/// %tmp2 = tail call i32 @f2() -/// ret i32 %tmp2 -/// @endcode -bool CodeGenPrepare::DupRetToEnableTailCallOpts(BasicBlock *BB) { - if (!TLI) - return false; - - ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator()); - if (!RI) - return false; - - PHINode *PN = 0; - BitCastInst *BCI = 0; - Value *V = RI->getReturnValue(); - if (V) { - BCI = dyn_cast<BitCastInst>(V); - if (BCI) - V = BCI->getOperand(0); - - PN = dyn_cast<PHINode>(V); - if (!PN) - return false; - } - - if (PN && PN->getParent() != BB) - return false; - - // It's not safe to eliminate the sign / zero extension of the return value. - // See llvm::isInTailCallPosition(). - const Function *F = BB->getParent(); - AttributeSet CallerAttrs = F->getAttributes(); - if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) || - CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt)) - return false; - - // Make sure there are no instructions between the PHI and return, or that the - // return is the first instruction in the block. - if (PN) { - BasicBlock::iterator BI = BB->begin(); - do { ++BI; } while (isa<DbgInfoIntrinsic>(BI)); - if (&*BI == BCI) - // Also skip over the bitcast. - ++BI; - if (&*BI != RI) - return false; - } else { - BasicBlock::iterator BI = BB->begin(); - while (isa<DbgInfoIntrinsic>(BI)) ++BI; - if (&*BI != RI) - return false; - } - - /// Only dup the ReturnInst if the CallInst is likely to be emitted as a tail - /// call. - SmallVector<CallInst*, 4> TailCalls; - if (PN) { - for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) { - CallInst *CI = dyn_cast<CallInst>(PN->getIncomingValue(I)); - // Make sure the phi value is indeed produced by the tail call. - if (CI && CI->hasOneUse() && CI->getParent() == PN->getIncomingBlock(I) && - TLI->mayBeEmittedAsTailCall(CI)) - TailCalls.push_back(CI); - } - } else { - SmallPtrSet<BasicBlock*, 4> VisitedBBs; - for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) { - if (!VisitedBBs.insert(*PI)) - continue; - - BasicBlock::InstListType &InstList = (*PI)->getInstList(); - BasicBlock::InstListType::reverse_iterator RI = InstList.rbegin(); - BasicBlock::InstListType::reverse_iterator RE = InstList.rend(); - do { ++RI; } while (RI != RE && isa<DbgInfoIntrinsic>(&*RI)); - if (RI == RE) - continue; - - CallInst *CI = dyn_cast<CallInst>(&*RI); - if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI)) - TailCalls.push_back(CI); - } - } - - bool Changed = false; - for (unsigned i = 0, e = TailCalls.size(); i != e; ++i) { - CallInst *CI = TailCalls[i]; - CallSite CS(CI); - - // Conservatively require the attributes of the call to match those of the - // return. Ignore noalias because it doesn't affect the call sequence. - AttributeSet CalleeAttrs = CS.getAttributes(); - if (AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex). - removeAttribute(Attribute::NoAlias) != - AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex). 
-          removeAttribute(Attribute::NoAlias))
-      continue;
-
-    // Make sure the call instruction is followed by an unconditional branch to
-    // the return block.
-    BasicBlock *CallBB = CI->getParent();
-    BranchInst *BI = dyn_cast<BranchInst>(CallBB->getTerminator());
-    if (!BI || !BI->isUnconditional() || BI->getSuccessor(0) != BB)
-      continue;
-
-    // Duplicate the return into CallBB.
-    (void)FoldReturnIntoUncondBranch(RI, BB, CallBB);
-    ModifiedDT = Changed = true;
-    ++NumRetsDup;
-  }
-
-  // If we eliminated all predecessors of the block, delete the block now.
-  if (Changed && !BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
-    BB->eraseFromParent();
-
-  return Changed;
-}
-
-//===----------------------------------------------------------------------===//
-// Memory Optimization
-//===----------------------------------------------------------------------===//
-
-namespace {
-
-/// ExtAddrMode - This is an extended version of TargetLowering::AddrMode
-/// which holds actual Value*'s for register values.
-struct ExtAddrMode : public TargetLowering::AddrMode {
-  Value *BaseReg;
-  Value *ScaledReg;
-  ExtAddrMode() : BaseReg(0), ScaledReg(0) {}
-  void print(raw_ostream &OS) const;
-  void dump() const;
-
-  bool operator==(const ExtAddrMode& O) const {
-    return (BaseReg == O.BaseReg) && (ScaledReg == O.ScaledReg) &&
-           (BaseGV == O.BaseGV) && (BaseOffs == O.BaseOffs) &&
-           (HasBaseReg == O.HasBaseReg) && (Scale == O.Scale);
-  }
-};
-
-#ifndef NDEBUG
-static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) {
-  AM.print(OS);
-  return OS;
-}
-#endif
-
-void ExtAddrMode::print(raw_ostream &OS) const {
-  bool NeedPlus = false;
-  OS << "[";
-  if (BaseGV) {
-    OS << (NeedPlus ? " + " : "")
-       << "GV:";
-    WriteAsOperand(OS, BaseGV, /*PrintType=*/false);
-    NeedPlus = true;
-  }
-
-  if (BaseOffs)
-    OS << (NeedPlus ? " + " : "") << BaseOffs, NeedPlus = true;
-
-  if (BaseReg) {
-    OS << (NeedPlus ? " + " : "")
-       << "Base:";
-    WriteAsOperand(OS, BaseReg, /*PrintType=*/false);
-    NeedPlus = true;
-  }
-  if (Scale) {
-    OS << (NeedPlus ? " + " : "")
-       << Scale << "*";
-    WriteAsOperand(OS, ScaledReg, /*PrintType=*/false);
-  }
-
-  OS << ']';
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void ExtAddrMode::dump() const {
-  print(dbgs());
-  dbgs() << '\n';
-}
-#endif
-
-
-/// \brief A helper class for matching addressing modes.
-///
-/// This encapsulates the logic for matching the target-legal addressing modes.
-class AddressingModeMatcher {
-  SmallVectorImpl<Instruction*> &AddrModeInsts;
-  const TargetLowering &TLI;
-
-  /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and
-  /// the memory instruction that we're computing this address for.
-  Type *AccessTy;
-  Instruction *MemoryInst;
-
-  /// AddrMode - This is the addressing mode that we're building up. This is
-  /// part of the return value of this addressing mode matching stuff.
-  ExtAddrMode &AddrMode;
-
-  /// IgnoreProfitability - This is set to true when we should not do
-  /// profitability checks. When true, IsProfitableToFoldIntoAddressingMode
-  /// always returns true.
-  bool IgnoreProfitability;
-
-  AddressingModeMatcher(SmallVectorImpl<Instruction*> &AMI,
-                        const TargetLowering &T, Type *AT,
-                        Instruction *MI, ExtAddrMode &AM)
-    : AddrModeInsts(AMI), TLI(T), AccessTy(AT), MemoryInst(MI), AddrMode(AM) {
-    IgnoreProfitability = false;
-  }
-public:
-
-  /// Match - Find the maximal addressing mode that a load/store of V can fold,
-  /// given an access type of AccessTy.
This returns a list of involved - /// instructions in AddrModeInsts. - static ExtAddrMode Match(Value *V, Type *AccessTy, - Instruction *MemoryInst, - SmallVectorImpl<Instruction*> &AddrModeInsts, - const TargetLowering &TLI) { - ExtAddrMode Result; - - bool Success = - AddressingModeMatcher(AddrModeInsts, TLI, AccessTy, - MemoryInst, Result).MatchAddr(V, 0); - (void)Success; assert(Success && "Couldn't select *anything*?"); - return Result; - } -private: - bool MatchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth); - bool MatchAddr(Value *V, unsigned Depth); - bool MatchOperationAddr(User *Operation, unsigned Opcode, unsigned Depth); - bool IsProfitableToFoldIntoAddressingMode(Instruction *I, - ExtAddrMode &AMBefore, - ExtAddrMode &AMAfter); - bool ValueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2); -}; - -/// MatchScaledValue - Try adding ScaleReg*Scale to the current addressing mode. -/// Return true and update AddrMode if this addr mode is legal for the target, -/// false if not. -bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, - unsigned Depth) { - // If Scale is 1, then this is the same as adding ScaleReg to the addressing - // mode. Just process that directly. - if (Scale == 1) - return MatchAddr(ScaleReg, Depth); - - // If the scale is 0, it takes nothing to add this. - if (Scale == 0) - return true; - - // If we already have a scale of this value, we can add to it, otherwise, we - // need an available scale field. - if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg) - return false; - - ExtAddrMode TestAddrMode = AddrMode; - - // Add scale to turn X*4+X*3 -> X*7. This could also do things like - // [A+B + A*7] -> [B+A*8]. - TestAddrMode.Scale += Scale; - TestAddrMode.ScaledReg = ScaleReg; - - // If the new address isn't legal, bail out. - if (!TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) - return false; - - // It was legal, so commit it. - AddrMode = TestAddrMode; - - // Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now - // to see if ScaleReg is actually X+C. If so, we can turn this into adding - // X*Scale + C*Scale to addr mode. - ConstantInt *CI = 0; Value *AddLHS = 0; - if (isa<Instruction>(ScaleReg) && // not a constant expr. - match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) { - TestAddrMode.ScaledReg = AddLHS; - TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale; - - // If this addressing mode is legal, commit it and remember that we folded - // this instruction. - if (TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) { - AddrModeInsts.push_back(cast<Instruction>(ScaleReg)); - AddrMode = TestAddrMode; - return true; - } - } - - // Otherwise, not (x+c)*scale, just return what we have. - return true; -} - -/// MightBeFoldableInst - This is a little filter, which returns true if an -/// addressing computation involving I might be folded into a load/store -/// accessing it. This doesn't need to be perfect, but needs to accept at least -/// the set of instructions that MatchOperationAddr can. -static bool MightBeFoldableInst(Instruction *I) { - switch (I->getOpcode()) { - case Instruction::BitCast: - // Don't touch identity bitcasts. - if (I->getType() == I->getOperand(0)->getType()) - return false; - return I->getType()->isPointerTy() || I->getType()->isIntegerTy(); - case Instruction::PtrToInt: - // PtrToInt is always a noop, as we know that the int type is pointer sized. 
-    return true;
-  case Instruction::IntToPtr:
-    // We know the input is intptr_t, so this is foldable.
-    return true;
-  case Instruction::Add:
-    return true;
-  case Instruction::Mul:
-  case Instruction::Shl:
-    // Can only handle X*C and X << C.
-    return isa<ConstantInt>(I->getOperand(1));
-  case Instruction::GetElementPtr:
-    return true;
-  default:
-    return false;
-  }
-}
-
-/// MatchOperationAddr - Given an instruction or constant expr, see if we can
-/// fold the operation into the addressing mode. If so, update the addressing
-/// mode and return true, otherwise return false without modifying AddrMode.
-bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
-                                               unsigned Depth) {
-  // Avoid exponential behavior on extremely deep expression trees.
-  if (Depth >= 5) return false;
-
-  switch (Opcode) {
-  case Instruction::PtrToInt:
-    // PtrToInt is always a noop, as we know that the int type is pointer sized.
-    return MatchAddr(AddrInst->getOperand(0), Depth);
-  case Instruction::IntToPtr:
-    // This inttoptr is a no-op if the integer type is pointer sized.
-    if (TLI.getValueType(AddrInst->getOperand(0)->getType()) ==
-        TLI.getPointerTy(AddrInst->getType()->getPointerAddressSpace()))
-      return MatchAddr(AddrInst->getOperand(0), Depth);
-    return false;
-  case Instruction::BitCast:
-    // BitCast is always a noop, and we can handle it as long as it is
-    // int->int or pointer->pointer (we don't want int<->fp or something).
-    if ((AddrInst->getOperand(0)->getType()->isPointerTy() ||
-         AddrInst->getOperand(0)->getType()->isIntegerTy()) &&
-        // Don't touch identity bitcasts. These were probably put here by LSR,
-        // and we don't want to mess around with them. Assume it knows what it
-        // is doing.
-        AddrInst->getOperand(0)->getType() != AddrInst->getType())
-      return MatchAddr(AddrInst->getOperand(0), Depth);
-    return false;
-  case Instruction::Add: {
-    // Check to see if we can merge in the RHS then the LHS. If so, we win.
-    ExtAddrMode BackupAddrMode = AddrMode;
-    unsigned OldSize = AddrModeInsts.size();
-    if (MatchAddr(AddrInst->getOperand(1), Depth+1) &&
-        MatchAddr(AddrInst->getOperand(0), Depth+1))
-      return true;
-
-    // Restore the old addr mode info.
-    AddrMode = BackupAddrMode;
-    AddrModeInsts.resize(OldSize);
-
-    // Otherwise this was over-aggressive. Try merging in the LHS then the RHS.
-    if (MatchAddr(AddrInst->getOperand(0), Depth+1) &&
-        MatchAddr(AddrInst->getOperand(1), Depth+1))
-      return true;
-
-    // Otherwise we definitely can't merge the ADD in.
-    AddrMode = BackupAddrMode;
-    AddrModeInsts.resize(OldSize);
-    break;
-  }
-  //case Instruction::Or:
-  // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD.
-  //break;
-  case Instruction::Mul:
-  case Instruction::Shl: {
-    // Can only handle X*C and X << C.
-    ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));
-    if (!RHS) return false;
-    int64_t Scale = RHS->getSExtValue();
-    if (Opcode == Instruction::Shl)
-      Scale = 1LL << Scale;
-
-    return MatchScaledValue(AddrInst->getOperand(0), Scale, Depth);
-  }
-  case Instruction::GetElementPtr: {
-    // Scan the GEP. We check whether it contains constant offsets and at most
-    // one variable offset.
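// (Editorial illustration, not from the original file.) For example, with
// 64-bit pointers the GEP
//     getelementptr { i32, i32 }* %p, i64 %i, i32 1
// contributes a constant offset of 4 for the struct field (via StructLayout)
// and one variable index %i scaled by sizeof({ i32, i32 }) = 8; any second
// variable index would cause the scan below to give up.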
- int VariableOperand = -1; - unsigned VariableScale = 0; - - int64_t ConstantOffset = 0; - const DataLayout *TD = TLI.getDataLayout(); - gep_type_iterator GTI = gep_type_begin(AddrInst); - for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) { - if (StructType *STy = dyn_cast<StructType>(*GTI)) { - const StructLayout *SL = TD->getStructLayout(STy); - unsigned Idx = - cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue(); - ConstantOffset += SL->getElementOffset(Idx); - } else { - uint64_t TypeSize = TD->getTypeAllocSize(GTI.getIndexedType()); - if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) { - ConstantOffset += CI->getSExtValue()*TypeSize; - } else if (TypeSize) { // Scales of zero don't do anything. - // We only allow one variable index at the moment. - if (VariableOperand != -1) - return false; - - // Remember the variable index. - VariableOperand = i; - VariableScale = TypeSize; - } - } - } - - // A common case is for the GEP to only do a constant offset. In this case, - // just add it to the disp field and check validity. - if (VariableOperand == -1) { - AddrMode.BaseOffs += ConstantOffset; - if (ConstantOffset == 0 || TLI.isLegalAddressingMode(AddrMode, AccessTy)){ - // Check to see if we can fold the base pointer in too. - if (MatchAddr(AddrInst->getOperand(0), Depth+1)) - return true; - } - AddrMode.BaseOffs -= ConstantOffset; - return false; - } - - // Save the valid addressing mode in case we can't match. - ExtAddrMode BackupAddrMode = AddrMode; - unsigned OldSize = AddrModeInsts.size(); - - // See if the scale and offset amount is valid for this target. - AddrMode.BaseOffs += ConstantOffset; - - // Match the base operand of the GEP. - if (!MatchAddr(AddrInst->getOperand(0), Depth+1)) { - // If it couldn't be matched, just stuff the value in a register. - if (AddrMode.HasBaseReg) { - AddrMode = BackupAddrMode; - AddrModeInsts.resize(OldSize); - return false; - } - AddrMode.HasBaseReg = true; - AddrMode.BaseReg = AddrInst->getOperand(0); - } - - // Match the remaining variable portion of the GEP. - if (!MatchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale, - Depth)) { - // If it couldn't be matched, try stuffing the base into a register - // instead of matching it, and retrying the match of the scale. - AddrMode = BackupAddrMode; - AddrModeInsts.resize(OldSize); - if (AddrMode.HasBaseReg) - return false; - AddrMode.HasBaseReg = true; - AddrMode.BaseReg = AddrInst->getOperand(0); - AddrMode.BaseOffs += ConstantOffset; - if (!MatchScaledValue(AddrInst->getOperand(VariableOperand), - VariableScale, Depth)) { - // If even that didn't work, bail. - AddrMode = BackupAddrMode; - AddrModeInsts.resize(OldSize); - return false; - } - } - - return true; - } - } - return false; -} - -/// MatchAddr - If we can, try to add the value of 'Addr' into the current -/// addressing mode. If Addr can't be added to AddrMode this returns false and -/// leaves AddrMode unmodified. This assumes that Addr is either a pointer type -/// or intptr_t for the target. -/// -bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { - if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) { - // Fold in immediates if legal for the target. - AddrMode.BaseOffs += CI->getSExtValue(); - if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) - return true; - AddrMode.BaseOffs -= CI->getSExtValue(); - } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) { - // If this is a global variable, try to fold it into the addressing mode. 
-    if (AddrMode.BaseGV == 0) {
-      AddrMode.BaseGV = GV;
-      if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
-        return true;
-      AddrMode.BaseGV = 0;
-    }
-  } else if (Instruction *I = dyn_cast<Instruction>(Addr)) {
-    ExtAddrMode BackupAddrMode = AddrMode;
-    unsigned OldSize = AddrModeInsts.size();
-
-    // Check to see if it is possible to fold this operation.
-    if (MatchOperationAddr(I, I->getOpcode(), Depth)) {
-      // Okay, it's possible to fold this. Check to see if it is actually
-      // *profitable* to do so. We use a simple cost model to avoid increasing
-      // register pressure too much.
-      if (I->hasOneUse() ||
-          IsProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) {
-        AddrModeInsts.push_back(I);
-        return true;
-      }
-
-      // It isn't profitable to do this, roll back.
-      //cerr << "NOT FOLDING: " << *I;
-      AddrMode = BackupAddrMode;
-      AddrModeInsts.resize(OldSize);
-    }
-  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
-    if (MatchOperationAddr(CE, CE->getOpcode(), Depth))
-      return true;
-  } else if (isa<ConstantPointerNull>(Addr)) {
-    // Null pointer gets folded without affecting the addressing mode.
-    return true;
-  }
-
-  // Worst case, the target should support [reg] addressing modes. :)
-  if (!AddrMode.HasBaseReg) {
-    AddrMode.HasBaseReg = true;
-    AddrMode.BaseReg = Addr;
-    // Still check for legality in case the target supports [imm] but not [i+r].
-    if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
-      return true;
-    AddrMode.HasBaseReg = false;
-    AddrMode.BaseReg = 0;
-  }
-
-  // If the base register is already taken, see if we can do [r+r].
-  if (AddrMode.Scale == 0) {
-    AddrMode.Scale = 1;
-    AddrMode.ScaledReg = Addr;
-    if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
-      return true;
-    AddrMode.Scale = 0;
-    AddrMode.ScaledReg = 0;
-  }
-  // Couldn't match.
-  return false;
-}
-
-/// IsOperandAMemoryOperand - Check to see if all uses of OpVal by the specified
-/// inline asm call are due to memory operands. If so, return true, otherwise
-/// return false.
-static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
-                                    const TargetLowering &TLI) {
-  TargetLowering::AsmOperandInfoVector TargetConstraints =
-    TLI.ParseConstraints(ImmutableCallSite(CI));
-  for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
-    TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
-
-    // Compute the constraint code and ConstraintType to use.
-    TLI.ComputeConstraintToUse(OpInfo, SDValue());
-
-    // If this asm operand is our Value*, and if it isn't an indirect memory
-    // operand, we can't fold it!
-    if (OpInfo.CallOperandVal == OpVal &&
-        (OpInfo.ConstraintType != TargetLowering::C_Memory ||
-         !OpInfo.isIndirect))
-      return false;
-  }
-
-  return true;
-}
-
-/// FindAllMemoryUses - Recursively walk all the uses of I until we find a
-/// memory use. If we find an obviously non-foldable instruction, return true.
-/// Add the ultimately found memory instructions to MemoryUses.
-static bool FindAllMemoryUses(Instruction *I,
-                SmallVectorImpl<std::pair<Instruction*,unsigned> > &MemoryUses,
-                              SmallPtrSet<Instruction*, 16> &ConsideredInsts,
-                              const TargetLowering &TLI) {
-  // If we already considered this instruction, we're done.
-  if (!ConsideredInsts.insert(I))
-    return false;
-
-  // If this is an obviously unfoldable instruction, bail out.
-  if (!MightBeFoldableInst(I))
-    return true;
-
-  // Loop over all the uses, recursively processing them.
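// (Editorial illustration, not from the original file.) Given an address %a
// with these uses:
//     %v = load i32* %a            ; recorded in MemoryUses
//     store i32* %a, i32** %slot   ; %a is the *value* operand (operand 0)
// the load is a foldable memory use, while the store makes the walk return
// true immediately: the address itself escapes, so it can never be folded.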
-  for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
-       UI != E; ++UI) {
-    User *U = *UI;
-
-    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
-      MemoryUses.push_back(std::make_pair(LI, UI.getOperandNo()));
-      continue;
-    }
-
-    if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
-      unsigned opNo = UI.getOperandNo();
-      if (opNo == 0) return true; // Storing addr, not into addr.
-      MemoryUses.push_back(std::make_pair(SI, opNo));
-      continue;
-    }
-
-    if (CallInst *CI = dyn_cast<CallInst>(U)) {
-      InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
-      if (!IA) return true;
-
-      // If this is a memory operand, we're cool, otherwise bail out.
-      if (!IsOperandAMemoryOperand(CI, IA, I, TLI))
-        return true;
-      continue;
-    }
-
-    if (FindAllMemoryUses(cast<Instruction>(U), MemoryUses, ConsideredInsts,
-                          TLI))
-      return true;
-  }
-
-  return false;
-}
-
-/// ValueAlreadyLiveAtInst - Return true if Val is already known to be live at
-/// the use site that we're folding it into. If so, there is no cost to
-/// include it in the addressing mode. KnownLive1 and KnownLive2 are two values
-/// that we know are live at the instruction already.
-bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1,
-                                                   Value *KnownLive2) {
-  // If Val is either of the known-live values, we know it is live!
-  if (Val == 0 || Val == KnownLive1 || Val == KnownLive2)
-    return true;
-
-  // All values other than instructions and arguments (e.g. constants) are live.
-  if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true;
-
-  // If Val is a constant sized alloca in the entry block, it is live, this is
-  // true because it is just a reference to the stack/frame pointer, which is
-  // live for the whole function.
-  if (AllocaInst *AI = dyn_cast<AllocaInst>(Val))
-    if (AI->isStaticAlloca())
-      return true;
-
-  // Check to see if this value is already used in the memory instruction's
-  // block. If so, it's already live into the block at the very least, so we
-  // can reasonably fold it.
-  return Val->isUsedInBasicBlock(MemoryInst->getParent());
-}
-
-/// IsProfitableToFoldIntoAddressingMode - It is possible for the addressing
-/// mode of the machine to fold the specified instruction into a load or store
-/// that ultimately uses it. However, the specified instruction has multiple
-/// uses. Given this, it may actually increase register pressure to fold it
-/// into the load. For example, consider this code:
-///
-///     X = ...
-///     Y = X+1
-///     use(Y)   -> nonload/store
-///     Z = Y+1
-///     load Z
-///
-/// In this case, Y has multiple uses, and can be folded into the load of Z
-/// (yielding load [X+2]). However, doing this will cause both "X" and "X+1" to
-/// be live at the use(Y) line. If we don't fold Y into load Z, we use one
-/// fewer register. Since Y can't be folded into "use(Y)" we don't increase the
-/// number of computations either.
-///
-/// Note that this (like most of CodeGenPrepare) is just a rough heuristic. If
-/// X was live across 'load Z' for other reasons, we actually *would* want to
-/// fold the addressing mode in the Z case. This would make Y die earlier.
-bool AddressingModeMatcher::
-IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
-                                     ExtAddrMode &AMAfter) {
-  if (IgnoreProfitability) return true;
-
-  // AMBefore is the addressing mode before this instruction was folded into it,
-  // and AMAfter is the addressing mode after the instruction was folded. Get
-  // the set of registers referenced by AMAfter and subtract out those
-  // referenced by AMBefore: this is the set of values which folding in this
-  // address extends the lifetime of.
-  //
-  // Note that there are only two potential values being referenced here,
-  // BaseReg and ScaleReg (global addresses are always available, as are any
-  // folded immediates).
-  Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg;
-
-  // If the BaseReg or ScaledReg was referenced by the previous addrmode, their
-  // lifetime wasn't extended by adding this instruction.
-  if (ValueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))
-    BaseReg = 0;
-  if (ValueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg))
-    ScaledReg = 0;
-
-  // If folding this instruction (and its subexprs) didn't extend any live
-  // ranges, we're ok with it.
-  if (BaseReg == 0 && ScaledReg == 0)
-    return true;
-
-  // If all uses of this instruction are ultimately load/store/inlineasm's,
-  // check to see if their addressing modes will include this instruction. If
-  // so, we can fold it into all uses, so it doesn't matter if it has multiple
-  // uses.
-  SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
-  SmallPtrSet<Instruction*, 16> ConsideredInsts;
-  if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI))
-    return false;  // Has a non-memory, non-foldable use!
-
-  // Now that we know that all uses of this instruction are part of a chain of
-  // computation involving only operations that could theoretically be folded
-  // into a memory use, loop over each of these uses and see if they could
-  // *actually* fold the instruction.
-  SmallVector<Instruction*, 32> MatchedAddrModeInsts;
-  for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) {
-    Instruction *User = MemoryUses[i].first;
-    unsigned OpNo = MemoryUses[i].second;
-
-    // Get the access type of this use. If the use isn't a pointer, we don't
-    // know what it accesses.
-    Value *Address = User->getOperand(OpNo);
-    if (!Address->getType()->isPointerTy())
-      return false;
-    Type *AddressAccessTy = Address->getType()->getPointerElementType();
-
-    // Do a match against the root of this address, ignoring profitability. This
-    // will tell us if the addressing mode for the memory operation will
-    // *actually* cover the shared instruction.
-    ExtAddrMode Result;
-    AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, AddressAccessTy,
-                                  MemoryInst, Result);
-    Matcher.IgnoreProfitability = true;
-    bool Success = Matcher.MatchAddr(Address, 0);
-    (void)Success; assert(Success && "Couldn't select *anything*?");
-
-    // If the match didn't cover I, then it won't be shared by it.
-    if (std::find(MatchedAddrModeInsts.begin(), MatchedAddrModeInsts.end(),
-                  I) == MatchedAddrModeInsts.end())
-      return false;
-
-    MatchedAddrModeInsts.clear();
-  }
-
-  return true;
-}
-
-} // end anonymous namespace
-
-/// IsNonLocalValue - Return true if the specified values are defined in a
-/// different basic block than BB.
-static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
-  if (Instruction *I = dyn_cast<Instruction>(V))
-    return I->getParent() != BB;
-  return false;
-}
-
-/// OptimizeMemoryInst - Load and Store Instructions often have
-/// addressing modes that can do significant amounts of computation. As such,
-/// instruction selection will try to get the load or store to do as much
-/// computation as possible for the program. The problem is that isel can only
-/// see within a single block. As such, we sink as much legal addressing mode
-/// stuff into the block as possible.
-///
-/// This method is used to optimize both load/store and inline asms with memory
-/// operands.
-bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
-                                        Type *AccessTy) {
-  Value *Repl = Addr;
-
-  // Try to collapse single-value PHI nodes. This is necessary to undo
-  // unprofitable PRE transformations.
-  SmallVector<Value*, 8> worklist;
-  SmallPtrSet<Value*, 16> Visited;
-  worklist.push_back(Addr);
-
-  // Use a worklist to iteratively look through PHI nodes, and ensure that
-  // the addressing mode obtained from the non-PHI roots of the graph
-  // are equivalent.
-  Value *Consensus = 0;
-  unsigned NumUsesConsensus = 0;
-  bool IsNumUsesConsensusValid = false;
-  SmallVector<Instruction*, 16> AddrModeInsts;
-  ExtAddrMode AddrMode;
-  while (!worklist.empty()) {
-    Value *V = worklist.back();
-    worklist.pop_back();
-
-    // Break use-def graph loops.
-    if (!Visited.insert(V)) {
-      Consensus = 0;
-      break;
-    }
-
-    // For a PHI node, push all of its incoming values.
-    if (PHINode *P = dyn_cast<PHINode>(V)) {
-      for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i)
-        worklist.push_back(P->getIncomingValue(i));
-      continue;
-    }
-
-    // For non-PHIs, determine the addressing mode being computed.
-    SmallVector<Instruction*, 16> NewAddrModeInsts;
-    ExtAddrMode NewAddrMode =
-      AddressingModeMatcher::Match(V, AccessTy, MemoryInst,
-                                   NewAddrModeInsts, *TLI);
-
-    // This check is broken into two cases with very similar code to avoid using
-    // getNumUses() as much as possible. Some values have a lot of uses, so
-    // calling getNumUses() unconditionally caused a significant compile-time
-    // regression.
-    if (!Consensus) {
-      Consensus = V;
-      AddrMode = NewAddrMode;
-      AddrModeInsts = NewAddrModeInsts;
-      continue;
-    } else if (NewAddrMode == AddrMode) {
-      if (!IsNumUsesConsensusValid) {
-        NumUsesConsensus = Consensus->getNumUses();
-        IsNumUsesConsensusValid = true;
-      }
-
-      // Ensure that the obtained addressing mode is equivalent to that obtained
-      // for all other roots of the PHI traversal. Also, when choosing one
-      // such root as representative, select the one with the most uses in order
-      // to keep the cost modeling heuristics in AddressingModeMatcher
-      // applicable.
-      unsigned NumUses = V->getNumUses();
-      if (NumUses > NumUsesConsensus) {
-        Consensus = V;
-        NumUsesConsensus = NumUses;
-        AddrModeInsts = NewAddrModeInsts;
-      }
-      continue;
-    }
-
-    Consensus = 0;
-    break;
-  }
-
-  // If the addressing mode couldn't be determined, or if multiple different
-  // ones were determined, bail out now.
-  if (!Consensus) return false;
-
-  // Check to see if any of the instructions subsumed by this addr mode are
-  // non-local to I's BB.
-  bool AnyNonLocal = false;
-  for (unsigned i = 0, e = AddrModeInsts.size(); i != e; ++i) {
-    if (IsNonLocalValue(AddrModeInsts[i], MemoryInst->getParent())) {
-      AnyNonLocal = true;
-      break;
-    }
-  }
-
-  // If all the instructions matched are already in this BB, don't do anything.
-  if (!AnyNonLocal) {
-    DEBUG(dbgs() << "CGP: Found local addrmode: " << AddrMode << "\n");
-    return false;
-  }
-
-  // Insert this computation right after this user. Since our caller is
-  // scanning from the top of the BB to the bottom, reuses of the expr are
-  // guaranteed to happen later.
-  IRBuilder<> Builder(MemoryInst);
-
-  // We have now determined the addressing expression we want to use, and we
-  // know that we have to sink it into this block.
Check to see if we have already - // done this for some other load/store instr in this block. If so, reuse the - // computation. - Value *&SunkAddr = SunkAddrs[Addr]; - if (SunkAddr) { - DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for " - << *MemoryInst); - if (SunkAddr->getType() != Addr->getType()) - SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType()); - } else { - DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " - << *MemoryInst); - Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(Addr->getType()); - Value *Result = 0; - - // Start with the base register. Do this first so that subsequent address - // matching finds it last, which will prevent it from trying to match it - // as the scaled value in case it happens to be a mul. That would be - // problematic if we've sunk a different mul for the scale, because then - // we'd end up sinking both muls. - if (AddrMode.BaseReg) { - Value *V = AddrMode.BaseReg; - if (V->getType()->isPointerTy()) - V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr"); - if (V->getType() != IntPtrTy) - V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr"); - Result = V; - } - - // Add the scale value. - if (AddrMode.Scale) { - Value *V = AddrMode.ScaledReg; - if (V->getType() == IntPtrTy) { - // done. - } else if (V->getType()->isPointerTy()) { - V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr"); - } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() < - cast<IntegerType>(V->getType())->getBitWidth()) { - V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr"); - } else { - V = Builder.CreateSExt(V, IntPtrTy, "sunkaddr"); - } - if (AddrMode.Scale != 1) - V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale), - "sunkaddr"); - if (Result) - Result = Builder.CreateAdd(Result, V, "sunkaddr"); - else - Result = V; - } - - // Add in the BaseGV if present. - if (AddrMode.BaseGV) { - Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr"); - if (Result) - Result = Builder.CreateAdd(Result, V, "sunkaddr"); - else - Result = V; - } - - // Add in the Base Offset if present. - if (AddrMode.BaseOffs) { - Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs); - if (Result) - Result = Builder.CreateAdd(Result, V, "sunkaddr"); - else - Result = V; - } - - if (Result == 0) - SunkAddr = Constant::getNullValue(Addr->getType()); - else - SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr"); - } - - MemoryInst->replaceUsesOfWith(Repl, SunkAddr); - - // If we have no uses, recursively delete the value and all dead instructions - // using it. - if (Repl->use_empty()) { - // This can cause recursive deletion, which can invalidate our iterator. - // Use a WeakVH to hold onto it in case this happens. - WeakVH IterHandle(CurInstIterator); - BasicBlock *BB = CurInstIterator->getParent(); - - RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo); - - if (IterHandle != CurInstIterator) { - // If the iterator instruction was recursively deleted, start over at the - // start of the block. - CurInstIterator = BB->begin(); - SunkAddrs.clear(); - } - } - ++NumMemoryInsts; - return true; -} - -/// OptimizeInlineAsmInst - If there are any memory operands, use -/// OptimizeMemoryInst to sink their address computing into the block when -/// possible / profitable. 
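// (Editorial illustration, not from the original file.) An indirect "m"
// constraint is such an operand:
//     call void asm sideeffect "incl $0", "=*m"(i32* %a)
// ParseConstraints() classifies it as C_Memory with isIndirect set, so %a is
// run through OptimizeMemoryInst below just like a load or store address.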
-bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) { - bool MadeChange = false; - - TargetLowering::AsmOperandInfoVector - TargetConstraints = TLI->ParseConstraints(CS); - unsigned ArgNo = 0; - for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { - TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; - - // Compute the constraint code and ConstraintType to use. - TLI->ComputeConstraintToUse(OpInfo, SDValue()); - - if (OpInfo.ConstraintType == TargetLowering::C_Memory && - OpInfo.isIndirect) { - Value *OpVal = CS->getArgOperand(ArgNo++); - MadeChange |= OptimizeMemoryInst(CS, OpVal, OpVal->getType()); - } else if (OpInfo.Type == InlineAsm::isInput) - ArgNo++; - } - - return MadeChange; -} - -/// MoveExtToFormExtLoad - Move a zext or sext fed by a load into the same -/// basic block as the load, unless conditions are unfavorable. This allows -/// SelectionDAG to fold the extend into the load. -/// -bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *I) { - // Look for a load being extended. - LoadInst *LI = dyn_cast<LoadInst>(I->getOperand(0)); - if (!LI) return false; - - // If they're already in the same block, there's nothing to do. - if (LI->getParent() == I->getParent()) - return false; - - // If the load has other users and the truncate is not free, this probably - // isn't worthwhile. - if (!LI->hasOneUse() && - TLI && (TLI->isTypeLegal(TLI->getValueType(LI->getType())) || - !TLI->isTypeLegal(TLI->getValueType(I->getType()))) && - !TLI->isTruncateFree(I->getType(), LI->getType())) - return false; - - // Check whether the target supports casts folded into loads. - unsigned LType; - if (isa<ZExtInst>(I)) - LType = ISD::ZEXTLOAD; - else { - assert(isa<SExtInst>(I) && "Unexpected ext type!"); - LType = ISD::SEXTLOAD; - } - if (TLI && !TLI->isLoadExtLegal(LType, TLI->getValueType(LI->getType()))) - return false; - - // Move the extend into the same block as the load, so that SelectionDAG - // can fold it. - I->removeFromParent(); - I->insertAfter(LI); - ++NumExtsMoved; - return true; -} - -bool CodeGenPrepare::OptimizeExtUses(Instruction *I) { - BasicBlock *DefBB = I->getParent(); - - // If the result of a {s|z}ext and its source are both live out, rewrite all - // other uses of the source with result of extension. - Value *Src = I->getOperand(0); - if (Src->hasOneUse()) - return false; - - // Only do this xform if truncating is free. - if (TLI && !TLI->isTruncateFree(I->getType(), Src->getType())) - return false; - - // Only safe to perform the optimization if the source is also defined in - // this block. - if (!isa<Instruction>(Src) || DefBB != cast<Instruction>(Src)->getParent()) - return false; - - bool DefIsLiveOut = false; - for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); - UI != E; ++UI) { - Instruction *User = cast<Instruction>(*UI); - - // Figure out which BB this ext is used in. - BasicBlock *UserBB = User->getParent(); - if (UserBB == DefBB) continue; - DefIsLiveOut = true; - break; - } - if (!DefIsLiveOut) - return false; - - // Make sure none of the uses are PHI nodes. - for (Value::use_iterator UI = Src->use_begin(), E = Src->use_end(); - UI != E; ++UI) { - Instruction *User = cast<Instruction>(*UI); - BasicBlock *UserBB = User->getParent(); - if (UserBB == DefBB) continue; - // Be conservative. We don't want this xform to end up introducing - // reloads just before load / store instructions. 
- if (isa<PHINode>(User) || isa<LoadInst>(User) || isa<StoreInst>(User)) - return false; - } - - // InsertedTruncs - Only insert one trunc in each block once. - DenseMap<BasicBlock*, Instruction*> InsertedTruncs; - - bool MadeChange = false; - for (Value::use_iterator UI = Src->use_begin(), E = Src->use_end(); - UI != E; ++UI) { - Use &TheUse = UI.getUse(); - Instruction *User = cast<Instruction>(*UI); - - // Figure out which BB this ext is used in. - BasicBlock *UserBB = User->getParent(); - if (UserBB == DefBB) continue; - - // Both src and def are live in this block. Rewrite the use. - Instruction *&InsertedTrunc = InsertedTruncs[UserBB]; - - if (!InsertedTrunc) { - BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); - InsertedTrunc = new TruncInst(I, Src->getType(), "", InsertPt); - } - - // Replace a use of the {s|z}ext source with a use of the result. - TheUse = InsertedTrunc; - ++NumExtUses; - MadeChange = true; - } - - return MadeChange; -} - -/// isFormingBranchFromSelectProfitable - Returns true if a SelectInst should be -/// turned into an explicit branch. -static bool isFormingBranchFromSelectProfitable(SelectInst *SI) { - // FIXME: This should use the same heuristics as IfConversion to determine - // whether a select is better represented as a branch. This requires that - // branch probability metadata is preserved for the select, which is not the - // case currently. - - CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition()); - - // If the branch is predicted right, an out of order CPU can avoid blocking on - // the compare. Emit cmovs on compares with a memory operand as branches to - // avoid stalls on the load from memory. If the compare has more than one use - // there's probably another cmov or setcc around so it's not worth emitting a - // branch. - if (!Cmp) - return false; - - Value *CmpOp0 = Cmp->getOperand(0); - Value *CmpOp1 = Cmp->getOperand(1); - - // We check that the memory operand has one use to avoid uses of the loaded - // value directly after the compare, making branches unprofitable. - return Cmp->hasOneUse() && - ((isa<LoadInst>(CmpOp0) && CmpOp0->hasOneUse()) || - (isa<LoadInst>(CmpOp1) && CmpOp1->hasOneUse())); -} - - -/// If we have a SelectInst that will likely profit from branch prediction, -/// turn it into a branch. -bool CodeGenPrepare::OptimizeSelectInst(SelectInst *SI) { - bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1); - - // Can we convert the 'select' to CF ? - if (DisableSelectToBranch || OptSize || !TLI || VectorCond) - return false; - - TargetLowering::SelectSupportKind SelectKind; - if (VectorCond) - SelectKind = TargetLowering::VectorMaskSelect; - else if (SI->getType()->isVectorTy()) - SelectKind = TargetLowering::ScalarCondVectorVal; - else - SelectKind = TargetLowering::ScalarValSelect; - - // Do we have efficient codegen support for this kind of 'selects' ? - if (TLI->isSelectSupported(SelectKind)) { - // We have efficient codegen support for the select instruction. - // Check if it is profitable to keep this 'select'. - if (!TLI->isPredictableSelectExpensive() || - !isFormingBranchFromSelectProfitable(SI)) - return false; - } - - ModifiedDT = true; - - // First, we split the block containing the select into 2 blocks. - BasicBlock *StartBlock = SI->getParent(); - BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(SI)); - BasicBlock *NextBlock = StartBlock->splitBasicBlock(SplitPt, "select.end"); - - // Create a new block serving as the landing pad for the branch. 
- BasicBlock *SmallBlock = BasicBlock::Create(SI->getContext(), "select.mid", - NextBlock->getParent(), NextBlock); - - // Move the unconditional branch from the block with the select in it into our - // landing pad block. - StartBlock->getTerminator()->eraseFromParent(); - BranchInst::Create(NextBlock, SmallBlock); - - // Insert the real conditional branch based on the original condition. - BranchInst::Create(NextBlock, SmallBlock, SI->getCondition(), SI); - - // The select itself is replaced with a PHI Node. - PHINode *PN = PHINode::Create(SI->getType(), 2, "", NextBlock->begin()); - PN->takeName(SI); - PN->addIncoming(SI->getTrueValue(), StartBlock); - PN->addIncoming(SI->getFalseValue(), SmallBlock); - SI->replaceAllUsesWith(PN); - SI->eraseFromParent(); - - // Instruct OptimizeBlock to skip to the next block. - CurInstIterator = StartBlock->end(); - ++NumSelectsExpanded; - return true; -} - -bool CodeGenPrepare::OptimizeInst(Instruction *I) { - if (PHINode *P = dyn_cast<PHINode>(I)) { - // It is possible for very late stage optimizations (such as SimplifyCFG) - // to introduce PHI nodes too late to be cleaned up. If we detect such a - // trivial PHI, go ahead and zap it here. - if (Value *V = SimplifyInstruction(P, TLI ? TLI->getDataLayout() : 0, - TLInfo, DT)) { - P->replaceAllUsesWith(V); - P->eraseFromParent(); - ++NumPHIsElim; - return true; - } - return false; - } - - if (CastInst *CI = dyn_cast<CastInst>(I)) { - // If the source of the cast is a constant, then this should have - // already been constant folded. The only reason NOT to constant fold - // it is if something (e.g. LSR) was careful to place the constant - // evaluation in a block other than then one that uses it (e.g. to hoist - // the address of globals out of a loop). If this is the case, we don't - // want to forward-subst the cast. - if (isa<Constant>(CI->getOperand(0))) - return false; - - if (TLI && OptimizeNoopCopyExpression(CI, *TLI)) - return true; - - if (isa<ZExtInst>(I) || isa<SExtInst>(I)) { - bool MadeChange = MoveExtToFormExtLoad(I); - return MadeChange | OptimizeExtUses(I); - } - return false; - } - - if (CmpInst *CI = dyn_cast<CmpInst>(I)) - return OptimizeCmpExpression(CI); - - if (LoadInst *LI = dyn_cast<LoadInst>(I)) { - if (TLI) - return OptimizeMemoryInst(I, I->getOperand(0), LI->getType()); - return false; - } - - if (StoreInst *SI = dyn_cast<StoreInst>(I)) { - if (TLI) - return OptimizeMemoryInst(I, SI->getOperand(1), - SI->getOperand(0)->getType()); - return false; - } - - if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) { - if (GEPI->hasAllZeroIndices()) { - /// The GEP operand must be a pointer, so must its result -> BitCast - Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(), - GEPI->getName(), GEPI); - GEPI->replaceAllUsesWith(NC); - GEPI->eraseFromParent(); - ++NumGEPsElim; - OptimizeInst(NC); - return true; - } - return false; - } - - if (CallInst *CI = dyn_cast<CallInst>(I)) - return OptimizeCallInst(CI); - - if (SelectInst *SI = dyn_cast<SelectInst>(I)) - return OptimizeSelectInst(SI); - - return false; -} - -// In this pass we look for GEP and cast instructions that are used -// across basic blocks and rewrite them to improve basic-block-at-a-time -// selection. 
-bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) {
-  SunkAddrs.clear();
-  bool MadeChange = false;
-
-  CurInstIterator = BB.begin();
-  while (CurInstIterator != BB.end())
-    MadeChange |= OptimizeInst(CurInstIterator++);
-
-  MadeChange |= DupRetToEnableTailCallOpts(&BB);
-
-  return MadeChange;
-}
-
-// llvm.dbg.value is far away from the value then iSel may not be able
-// handle it properly. iSel will drop llvm.dbg.value if it can not
-// find a node corresponding to the value.
-bool CodeGenPrepare::PlaceDbgValues(Function &F) {
-  bool MadeChange = false;
-  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
-    Instruction *PrevNonDbgInst = NULL;
-    for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
-      Instruction *Insn = BI; ++BI;
-      DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn);
-      if (!DVI) {
-        PrevNonDbgInst = Insn;
-        continue;
-      }
-
-      Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue());
-      if (VI && VI != PrevNonDbgInst && !VI->isTerminator()) {
-        DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI);
-        DVI->removeFromParent();
-        if (isa<PHINode>(VI))
-          DVI->insertBefore(VI->getParent()->getFirstInsertionPt());
-        else
-          DVI->insertAfter(VI);
-        MadeChange = true;
-        ++NumDbgValueMoved;
-      }
-    }
-  }
-  return MadeChange;
-}
diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
new file mode 100644
index 0000000..763d02b
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -0,0 +1,602 @@
+//===- ConstantHoisting.cpp - Prepare code for expensive constants --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies expensive constants to hoist and coalesces them to
+// better prepare the function for SelectionDAG-based code generation. This
+// works around the limitations of the basic-block-at-a-time approach.
+//
+// First it scans all instructions for integer constants and calculates their
+// cost. If the constant can be folded into the instruction (the cost is
+// TCC_Free) or the cost is just a simple operation (TCC_Basic), then we don't
+// consider it expensive and leave it alone. This is the default behavior and
+// the default implementation of getIntImmCost will always return TCC_Free.
+//
+// If the cost is more than TCC_Basic, then the integer constant can't be folded
+// into the instruction and it might be beneficial to hoist the constant.
+// Similar constants are coalesced to reduce register pressure and
+// materialization code.
+//
+// When a constant is hoisted, it is also hidden behind a bitcast to force it to
+// be live-out of the basic block. Otherwise the constant would be just
+// duplicated and each basic block would have its own copy in the SelectionDAG.
+// The SelectionDAG recognizes such constants as opaque and doesn't perform
+// certain transformations on them, which would create a new expensive constant.
+//
+// This optimization is only applied to integer constants in instructions and
+// simple (that is, not nested) constant cast expressions.
For example: +// %0 = load i64* inttoptr (i64 big_constant to i64*) +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include <tuple> + +using namespace llvm; + +#define DEBUG_TYPE "consthoist" + +STATISTIC(NumConstantsHoisted, "Number of constants hoisted"); +STATISTIC(NumConstantsRebased, "Number of constants rebased"); + +namespace { +struct ConstantUser; +struct RebasedConstantInfo; + +typedef SmallVector<ConstantUser, 8> ConstantUseListType; +typedef SmallVector<RebasedConstantInfo, 4> RebasedConstantListType; + +/// \brief Keeps track of the user of a constant and the operand index where the +/// constant is used. +struct ConstantUser { + Instruction *Inst; + unsigned OpndIdx; + + ConstantUser(Instruction *Inst, unsigned Idx) : Inst(Inst), OpndIdx(Idx) { } +}; + +/// \brief Keeps track of a constant candidate and its uses. +struct ConstantCandidate { + ConstantUseListType Uses; + ConstantInt *ConstInt; + unsigned CumulativeCost; + + ConstantCandidate(ConstantInt *ConstInt) + : ConstInt(ConstInt), CumulativeCost(0) { } + + /// \brief Add the user to the use list and update the cost. + void addUser(Instruction *Inst, unsigned Idx, unsigned Cost) { + CumulativeCost += Cost; + Uses.push_back(ConstantUser(Inst, Idx)); + } +}; + +/// \brief This represents a constant that has been rebased with respect to a +/// base constant. The difference to the base constant is recorded in Offset. +struct RebasedConstantInfo { + ConstantUseListType Uses; + Constant *Offset; + + RebasedConstantInfo(ConstantUseListType &&Uses, Constant *Offset) + : Uses(Uses), Offset(Offset) { } +}; + +/// \brief A base constant and all its rebased constants. +struct ConstantInfo { + ConstantInt *BaseConstant; + RebasedConstantListType RebasedConstants; +}; + +/// \brief The constant hoisting pass. +class ConstantHoisting : public FunctionPass { + typedef DenseMap<ConstantInt *, unsigned> ConstCandMapType; + typedef std::vector<ConstantCandidate> ConstCandVecType; + + const TargetTransformInfo *TTI; + DominatorTree *DT; + BasicBlock *Entry; + + /// Keeps track of constant candidates found in the function. + ConstCandVecType ConstCandVec; + + /// Keep track of cast instructions we already cloned. + SmallDenseMap<Instruction *, Instruction *> ClonedCastMap; + + /// These are the final constants we decided to hoist. + SmallVector<ConstantInfo, 8> ConstantVec; +public: + static char ID; // Pass identification, replacement for typeid + ConstantHoisting() : FunctionPass(ID), TTI(nullptr), DT(nullptr), + Entry(nullptr) { + initializeConstantHoistingPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &Fn) override; + + const char *getPassName() const override { return "Constant Hoisting"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<TargetTransformInfo>(); + } + +private: + /// \brief Initialize the pass. + void setup(Function &Fn) { + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + TTI = &getAnalysis<TargetTransformInfo>(); + Entry = &Fn.getEntryBlock(); + } + + /// \brief Cleanup. 
+ void cleanup() { + ConstantVec.clear(); + ClonedCastMap.clear(); + ConstCandVec.clear(); + + TTI = nullptr; + DT = nullptr; + Entry = nullptr; + } + + Instruction *findMatInsertPt(Instruction *Inst, unsigned Idx = ~0U) const; + Instruction *findConstantInsertionPoint(const ConstantInfo &ConstInfo) const; + void collectConstantCandidates(ConstCandMapType &ConstCandMap, + Instruction *Inst, unsigned Idx, + ConstantInt *ConstInt); + void collectConstantCandidates(ConstCandMapType &ConstCandMap, + Instruction *Inst); + void collectConstantCandidates(Function &Fn); + void findAndMakeBaseConstant(ConstCandVecType::iterator S, + ConstCandVecType::iterator E); + void findBaseConstants(); + void emitBaseConstants(Instruction *Base, Constant *Offset, + const ConstantUser &ConstUser); + bool emitBaseConstants(); + void deleteDeadCastInst() const; + bool optimizeConstants(Function &Fn); +}; +} + +char ConstantHoisting::ID = 0; +INITIALIZE_PASS_BEGIN(ConstantHoisting, "consthoist", "Constant Hoisting", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_END(ConstantHoisting, "consthoist", "Constant Hoisting", + false, false) + +FunctionPass *llvm::createConstantHoistingPass() { + return new ConstantHoisting(); +} + +/// \brief Perform the constant hoisting optimization for the given function. +bool ConstantHoisting::runOnFunction(Function &Fn) { + DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n"); + DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n'); + + setup(Fn); + + bool MadeChange = optimizeConstants(Fn); + + if (MadeChange) { + DEBUG(dbgs() << "********** Function after Constant Hoisting: " + << Fn.getName() << '\n'); + DEBUG(dbgs() << Fn); + } + DEBUG(dbgs() << "********** End Constant Hoisting **********\n"); + + cleanup(); + + return MadeChange; +} + + +/// \brief Find the constant materialization insertion point. +Instruction *ConstantHoisting::findMatInsertPt(Instruction *Inst, + unsigned Idx) const { + // If the operand is a cast instruction, then we have to materialize the + // constant before the cast instruction. + if (Idx != ~0U) { + Value *Opnd = Inst->getOperand(Idx); + if (auto CastInst = dyn_cast<Instruction>(Opnd)) + if (CastInst->isCast()) + return CastInst; + } + + // The simple and common case. This also includes constant expressions. + if (!isa<PHINode>(Inst) && !isa<LandingPadInst>(Inst)) + return Inst; + + // We can't insert directly before a phi node or landing pad. Insert before + // the terminator of the incoming or dominating block. + assert(Entry != Inst->getParent() && "PHI or landing pad in entry block!"); + if (Idx != ~0U && isa<PHINode>(Inst)) + return cast<PHINode>(Inst)->getIncomingBlock(Idx)->getTerminator(); + + BasicBlock *IDom = DT->getNode(Inst->getParent())->getIDom()->getBlock(); + return IDom->getTerminator(); +} + +/// \brief Find an insertion point that dominates all uses. +Instruction *ConstantHoisting:: +findConstantInsertionPoint(const ConstantInfo &ConstInfo) const { + assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry."); + // Collect all basic blocks. 
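+  // (Sketch of the search below: the materialization block of every use is
+  // gathered first, then the set is folded pairwise through
+  // DT->findNearestCommonDominator until a single block that dominates all
+  // uses remains; reaching the entry block ends the search early, since it
+  // dominates everything.)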
+  SmallPtrSet<BasicBlock *, 8> BBs;
+  for (auto const &RCI : ConstInfo.RebasedConstants)
+    for (auto const &U : RCI.Uses)
+      BBs.insert(findMatInsertPt(U.Inst, U.OpndIdx)->getParent());
+
+  if (BBs.count(Entry))
+    return &Entry->front();
+
+  while (BBs.size() >= 2) {
+    BasicBlock *BB, *BB1, *BB2;
+    BB1 = *BBs.begin();
+    BB2 = *std::next(BBs.begin());
+    BB = DT->findNearestCommonDominator(BB1, BB2);
+    if (BB == Entry)
+      return &Entry->front();
+    BBs.erase(BB1);
+    BBs.erase(BB2);
+    BBs.insert(BB);
+  }
+  assert((BBs.size() == 1) && "Expected only one element.");
+  Instruction &FirstInst = (*BBs.begin())->front();
+  return findMatInsertPt(&FirstInst);
+}
+
+
+/// \brief Record constant integer ConstInt for instruction Inst at operand
+/// index Idx.
+///
+/// The operand at index Idx is not necessarily the constant integer itself. It
+/// could also be a cast instruction or a constant expression that uses the
+/// constant integer.
+void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap,
+                                                 Instruction *Inst,
+                                                 unsigned Idx,
+                                                 ConstantInt *ConstInt) {
+  unsigned Cost;
+  // Ask the target about the cost of materializing the constant for the given
+  // instruction and operand index.
+  if (auto IntrInst = dyn_cast<IntrinsicInst>(Inst))
+    Cost = TTI->getIntImmCost(IntrInst->getIntrinsicID(), Idx,
+                              ConstInt->getValue(), ConstInt->getType());
+  else
+    Cost = TTI->getIntImmCost(Inst->getOpcode(), Idx, ConstInt->getValue(),
+                              ConstInt->getType());
+
+  // Ignore cheap integer constants.
+  if (Cost > TargetTransformInfo::TCC_Basic) {
+    ConstCandMapType::iterator Itr;
+    bool Inserted;
+    std::tie(Itr, Inserted) = ConstCandMap.insert(std::make_pair(ConstInt, 0));
+    if (Inserted) {
+      ConstCandVec.push_back(ConstantCandidate(ConstInt));
+      Itr->second = ConstCandVec.size() - 1;
+    }
+    ConstCandVec[Itr->second].addUser(Inst, Idx, Cost);
+    DEBUG(if (isa<ConstantInt>(Inst->getOperand(Idx)))
+            dbgs() << "Collect constant " << *ConstInt << " from " << *Inst
+                   << " with cost " << Cost << '\n';
+          else
+            dbgs() << "Collect constant " << *ConstInt << " indirectly from "
+                   << *Inst << " via " << *Inst->getOperand(Idx) << " with cost "
+                   << Cost << '\n';
+    );
+  }
+}
+
+/// \brief Scan the instruction for expensive integer constants and record them
+/// in the constant candidate vector.
+void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap,
+                                                 Instruction *Inst) {
+  // Skip all cast instructions. They are visited indirectly later on.
+  if (Inst->isCast())
+    return;
+
+  // Can't handle inline asm. Skip it.
+  if (auto Call = dyn_cast<CallInst>(Inst))
+    if (isa<InlineAsm>(Call->getCalledValue()))
+      return;
+
+  // Scan all operands.
+  for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) {
+    Value *Opnd = Inst->getOperand(Idx);
+
+    // Visit constant integers.
+    if (auto ConstInt = dyn_cast<ConstantInt>(Opnd)) {
+      collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+      continue;
+    }
+
+    // Visit cast instructions that have constant integers.
+    if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
+      // Only visit cast instructions, which have been skipped. All other
+      // instructions should have already been visited.
+      if (!CastInst->isCast())
+        continue;
+
+      if (auto *ConstInt = dyn_cast<ConstantInt>(CastInst->getOperand(0))) {
+        // Pretend the constant is directly used by the instruction and ignore
+        // the cast instruction.
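+        // Illustrative example (hypothetical IR): for
+        //   %p = inttoptr i64 123456789012345 to i8*
+        // feeding a store, the i64 constant is recorded as if the store used
+        // it directly, so a hoisted value can later be routed through a clone
+        // of the cast.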
+        collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+        continue;
+      }
+    }
+
+    // Visit constant expressions that have constant integers.
+    if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
+      // Only visit constant cast expressions.
+      if (!ConstExpr->isCast())
+        continue;
+
+      if (auto ConstInt = dyn_cast<ConstantInt>(ConstExpr->getOperand(0))) {
+        // Pretend the constant is directly used by the instruction and ignore
+        // the constant expression.
+        collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+        continue;
+      }
+    }
+  } // end of for all operands
+}
+
+/// \brief Collect all integer constants in the function that cannot be folded
+/// into an instruction itself.
+void ConstantHoisting::collectConstantCandidates(Function &Fn) {
+  ConstCandMapType ConstCandMap;
+  for (Function::iterator BB : Fn)
+    for (BasicBlock::iterator Inst : *BB)
+      collectConstantCandidates(ConstCandMap, Inst);
+}
+
+/// \brief Find the base constant within the given range and rebase all other
+/// constants with respect to the base constant.
+void ConstantHoisting::findAndMakeBaseConstant(ConstCandVecType::iterator S,
+                                               ConstCandVecType::iterator E) {
+  auto MaxCostItr = S;
+  unsigned NumUses = 0;
+  // Use the constant that has the maximum cost as the base constant.
+  for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
+    NumUses += ConstCand->Uses.size();
+    if (ConstCand->CumulativeCost > MaxCostItr->CumulativeCost)
+      MaxCostItr = ConstCand;
+  }
+
+  // Don't hoist constants that have only one use.
+  if (NumUses <= 1)
+    return;
+
+  ConstantInfo ConstInfo;
+  ConstInfo.BaseConstant = MaxCostItr->ConstInt;
+  Type *Ty = ConstInfo.BaseConstant->getType();
+
+  // Rebase the constants with respect to the base constant.
+  for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
+    APInt Diff = ConstCand->ConstInt->getValue() -
+                 ConstInfo.BaseConstant->getValue();
+    Constant *Offset = Diff == 0 ? nullptr : ConstantInt::get(Ty, Diff);
+    ConstInfo.RebasedConstants.push_back(
+      RebasedConstantInfo(std::move(ConstCand->Uses), Offset));
+  }
+  ConstantVec.push_back(ConstInfo);
+}
+
+/// \brief Finds and combines constant candidates that can be easily
+/// rematerialized with an add from a common base constant.
+void ConstantHoisting::findBaseConstants() {
+  // Sort the constants by value and type. This invalidates the mapping!
+  std::sort(ConstCandVec.begin(), ConstCandVec.end(),
+            [](const ConstantCandidate &LHS, const ConstantCandidate &RHS) {
+    if (LHS.ConstInt->getType() != RHS.ConstInt->getType())
+      return LHS.ConstInt->getType()->getBitWidth() <
+             RHS.ConstInt->getType()->getBitWidth();
+    return LHS.ConstInt->getValue().ult(RHS.ConstInt->getValue());
+  });
+
+  // Simple linear scan through the sorted constant candidate vector for viable
+  // merge candidates.
+  auto MinValItr = ConstCandVec.begin();
+  for (auto CC = std::next(ConstCandVec.begin()), E = ConstCandVec.end();
+       CC != E; ++CC) {
+    if (MinValItr->ConstInt->getType() == CC->ConstInt->getType()) {
+      // Check if the constant is in range of an add with immediate.
+      APInt Diff = CC->ConstInt->getValue() - MinValItr->ConstInt->getValue();
+      if ((Diff.getBitWidth() <= 64) &&
+          TTI->isLegalAddImmediate(Diff.getSExtValue()))
+        continue;
+    }
+    // We now either have a different constant type, or the constant is no
+    // longer in range of an add with immediate.
+    findAndMakeBaseConstant(MinValItr, CC);
+    // Start a new base constant search.
+    MinValItr = CC;
+  }
+  // Finalize the last base constant search.
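+  // Example (hypothetical values): for i64 candidates {100, 102, 1 << 40},
+  // {100, 102} form one group and 1 << 40 starts a new one, assuming the
+  // target accepts 2 but rejects (1 << 40) - 100 as an add immediate; within
+  // each group the candidate with the highest cumulative cost becomes the
+  // base constant.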
+  findAndMakeBaseConstant(MinValItr, ConstCandVec.end());
+}
+
+/// \brief Updates the operand at Idx in instruction Inst with the result of
+/// instruction Mat. If the instruction is a PHI node then special
+/// handling for duplicate values from the same incoming basic block is
+/// required.
+/// \return The update will always succeed, but the return value indicates
+/// whether Mat was used for the update or not.
+static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat) {
+  if (auto PHI = dyn_cast<PHINode>(Inst)) {
+    // Check if any previous operand of the PHI node has the same incoming basic
+    // block. This is a very odd case that happens when the incoming basic block
+    // has a switch statement. In this case use the same value as the previous
+    // operand(s), otherwise we will fail verification due to different values.
+    // The values are actually the same, but the variable names are different
+    // and the verifier doesn't like that.
+    BasicBlock *IncomingBB = PHI->getIncomingBlock(Idx);
+    for (unsigned i = 0; i < Idx; ++i) {
+      if (PHI->getIncomingBlock(i) == IncomingBB) {
+        Value *IncomingVal = PHI->getIncomingValue(i);
+        Inst->setOperand(Idx, IncomingVal);
+        return false;
+      }
+    }
+  }
+
+  Inst->setOperand(Idx, Mat);
+  return true;
+}
+
+/// \brief Emit materialization code for all rebased constants and update their
+/// users.
+void ConstantHoisting::emitBaseConstants(Instruction *Base, Constant *Offset,
+                                         const ConstantUser &ConstUser) {
+  Instruction *Mat = Base;
+  if (Offset) {
+    Instruction *InsertionPt = findMatInsertPt(ConstUser.Inst,
+                                               ConstUser.OpndIdx);
+    Mat = BinaryOperator::Create(Instruction::Add, Base, Offset,
+                                 "const_mat", InsertionPt);
+
+    DEBUG(dbgs() << "Materialize constant (" << *Base->getOperand(0)
+                 << " + " << *Offset << ") in BB "
+                 << Mat->getParent()->getName() << '\n' << *Mat << '\n');
+    Mat->setDebugLoc(ConstUser.Inst->getDebugLoc());
+  }
+  Value *Opnd = ConstUser.Inst->getOperand(ConstUser.OpndIdx);
+
+  // Visit constant integer.
+  if (isa<ConstantInt>(Opnd)) {
+    DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+    if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, Mat) && Offset)
+      Mat->eraseFromParent();
+    DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+    return;
+  }
+
+  // Visit cast instruction.
+  if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
+    assert(CastInst->isCast() && "Expected a cast instruction!");
+    // Check if we have already visited this cast instruction to avoid
+    // unnecessary cloning.
+    Instruction *&ClonedCastInst = ClonedCastMap[CastInst];
+    if (!ClonedCastInst) {
+      ClonedCastInst = CastInst->clone();
+      ClonedCastInst->setOperand(0, Mat);
+      ClonedCastInst->insertAfter(CastInst);
+      // Use the same debug location as the original cast instruction.
+      ClonedCastInst->setDebugLoc(CastInst->getDebugLoc());
+      DEBUG(dbgs() << "Clone instruction: " << *CastInst << '\n'
+                   << "To : " << *ClonedCastInst << '\n');
+    }
+
+    DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+    updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ClonedCastInst);
+    DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+    return;
+  }
+
+  // Visit constant expression.
+  if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
+    Instruction *ConstExprInst = ConstExpr->getAsInstruction();
+    ConstExprInst->setOperand(0, Mat);
+    ConstExprInst->insertBefore(findMatInsertPt(ConstUser.Inst,
+                                                ConstUser.OpndIdx));
+
+    // Use the same debug location as the instruction we are about to update.
+ ConstExprInst->setDebugLoc(ConstUser.Inst->getDebugLoc()); + + DEBUG(dbgs() << "Create instruction: " << *ConstExprInst << '\n' + << "From : " << *ConstExpr << '\n'); + DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n'); + if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ConstExprInst)) { + ConstExprInst->eraseFromParent(); + if (Offset) + Mat->eraseFromParent(); + } + DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n'); + return; + } +} + +/// \brief Hoist and hide the base constant behind a bitcast and emit +/// materialization code for derived constants. +bool ConstantHoisting::emitBaseConstants() { + bool MadeChange = false; + for (auto const &ConstInfo : ConstantVec) { + // Hoist and hide the base constant behind a bitcast. + Instruction *IP = findConstantInsertionPoint(ConstInfo); + IntegerType *Ty = ConstInfo.BaseConstant->getType(); + Instruction *Base = + new BitCastInst(ConstInfo.BaseConstant, Ty, "const", IP); + DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant << ") to BB " + << IP->getParent()->getName() << '\n' << *Base << '\n'); + NumConstantsHoisted++; + + // Emit materialization code for all rebased constants. + for (auto const &RCI : ConstInfo.RebasedConstants) { + NumConstantsRebased++; + for (auto const &U : RCI.Uses) + emitBaseConstants(Base, RCI.Offset, U); + } + + // Use the same debug location as the last user of the constant. + assert(!Base->use_empty() && "The use list is empty!?"); + assert(isa<Instruction>(Base->user_back()) && + "All uses should be instructions."); + Base->setDebugLoc(cast<Instruction>(Base->user_back())->getDebugLoc()); + + // Correct for base constant, which we counted above too. + NumConstantsRebased--; + MadeChange = true; + } + return MadeChange; +} + +/// \brief Check all cast instructions we made a copy of and remove them if they +/// have no more users. +void ConstantHoisting::deleteDeadCastInst() const { + for (auto const &I : ClonedCastMap) + if (I.first->use_empty()) + I.first->eraseFromParent(); +} + +/// \brief Optimize expensive integer constants in the given function. +bool ConstantHoisting::optimizeConstants(Function &Fn) { + // Collect all constant candidates. + collectConstantCandidates(Fn); + + // There are no constant candidates to worry about. + if (ConstCandVec.empty()) + return false; + + // Combine constants that can be easily materialized with an add from a common + // base constant. + findBaseConstants(); + + // There are no constants to emit. + if (ConstantVec.empty()) + return false; + + // Finally hoist the base constant and emit materialization code for dependent + // constants. + bool MadeChange = emitBaseConstants(); + + // Cleanup dead instructions. 
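+  // (These are the original cast instructions that were cloned during
+  // rebasing; once all of their users have been rewritten to use the clones,
+  // they may be left without uses and can be erased.)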
+ deleteDeadCastInst(); + + return MadeChange; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp index d5a96ec..dd51ce1 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp @@ -18,19 +18,20 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "constprop" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/IR/Constant.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/Pass.h" -#include "llvm/Support/InstIterator.h" #include "llvm/Target/TargetLibraryInfo.h" #include <set> using namespace llvm; +#define DEBUG_TYPE "constprop" + STATISTIC(NumInstKilled, "Number of instructions killed"); namespace { @@ -40,9 +41,9 @@ namespace { initializeConstantPropagationPass(*PassRegistry::getPassRegistry()); } - bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<TargetLibraryInfo>(); } @@ -67,7 +68,8 @@ bool ConstantPropagation::runOnFunction(Function &F) { WorkList.insert(&*i); } bool Changed = false; - DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); while (!WorkList.empty()) { @@ -75,12 +77,11 @@ bool ConstantPropagation::runOnFunction(Function &F) { WorkList.erase(WorkList.begin()); // Get an element from the worklist... if (!I->use_empty()) // Don't muck with dead instructions... - if (Constant *C = ConstantFoldInstruction(I, TD, TLI)) { + if (Constant *C = ConstantFoldInstruction(I, DL, TLI)) { // Add all of the users of this instruction to the worklist, they might // be constant propagatable now... - for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); - UI != UE; ++UI) - WorkList.insert(cast<Instruction>(*UI)); + for (User *U : I->users()) + WorkList.insert(cast<Instruction>(U)); // Replace all of the uses of a variable with uses of the constant. 
I->replaceAllUsesWith(C); diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 995782e..0829462 100644 --- a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -11,21 +11,22 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "correlated-value-propagation" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/Pass.h" -#include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "correlated-value-propagation" + STATISTIC(NumPhis, "Number of phis propagated"); STATISTIC(NumSelects, "Number of selects propagated"); STATISTIC(NumMemAccess, "Number of memory access targets propagated"); @@ -48,9 +49,9 @@ namespace { initializeCorrelatedValuePropagationPass(*PassRegistry::getPassRegistry()); } - bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LazyValueInfo>(); } }; @@ -138,7 +139,7 @@ bool CorrelatedValuePropagation::processPHI(PHINode *P) { } bool CorrelatedValuePropagation::processMemAccess(Instruction *I) { - Value *Pointer = 0; + Value *Pointer = nullptr; if (LoadInst *L = dyn_cast<LoadInst>(I)) Pointer = L->getPointerOperand(); else @@ -281,6 +282,9 @@ bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) { } bool CorrelatedValuePropagation::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; + LVI = &getAnalysis<LazyValueInfo>(); bool FnChanged = false; diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp index e8a090a..99fac75 100644 --- a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp @@ -16,16 +16,17 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "dce" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/Pass.h" -#include "llvm/Support/InstIterator.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "dce" + STATISTIC(DIEEliminated, "Number of insts removed by DIE pass"); STATISTIC(DCEEliminated, "Number of insts removed"); @@ -38,7 +39,9 @@ namespace { DeadInstElimination() : BasicBlockPass(ID) { initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnBasicBlock(BasicBlock &BB) { + bool runOnBasicBlock(BasicBlock &BB) override { + if (skipOptnoneFunction(BB)) + return false; TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); bool Changed = false; for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) { @@ -52,7 +55,7 @@ namespace { return Changed; } - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); } }; @@ -77,9 +80,9 @@ namespace { 
initializeDCEPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); } }; @@ -89,6 +92,9 @@ char DCE::ID = 0; INITIALIZE_PASS(DCE, "dce", "Dead Code Elimination", false, false) bool DCE::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; + TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); // Start out with all of the instructions in the worklist... diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 57432c7..3af8ee7 100644 --- a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -15,19 +15,18 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "dse" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CaptureTracking.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" @@ -38,6 +37,8 @@ #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "dse" + STATISTIC(NumFastStores, "Number of stores deleted"); STATISTIC(NumFastOther , "Number of other instrs removed"); @@ -49,14 +50,17 @@ namespace { const TargetLibraryInfo *TLI; static char ID; // Pass identification, replacement for typeid - DSE() : FunctionPass(ID), AA(0), MD(0), DT(0) { + DSE() : FunctionPass(ID), AA(nullptr), MD(nullptr), DT(nullptr) { initializeDSEPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnFunction(Function &F) { + bool runOnFunction(Function &F) override { + if (skipOptnoneFunction(F)) + return false; + AA = &getAnalysis<AliasAnalysis>(); MD = &getAnalysis<MemoryDependenceAnalysis>(); - DT = &getAnalysis<DominatorTree>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); TLI = AA->getTargetLibraryInfo(); bool Changed = false; @@ -66,7 +70,7 @@ namespace { if (DT->isReachableFromEntry(I)) Changed |= runOnBasicBlock(*I); - AA = 0; MD = 0; DT = 0; + AA = nullptr; MD = nullptr; DT = nullptr; return Changed; } @@ -76,13 +80,13 @@ namespace { void RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, SmallSetVector<Value*, 16> &DeadStackObjects); - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<DominatorTree>(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<AliasAnalysis>(); AU.addRequired<MemoryDependenceAnalysis>(); AU.addPreserved<AliasAnalysis>(); - AU.addPreserved<DominatorTree>(); + AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<MemoryDependenceAnalysis>(); } }; @@ -90,7 +94,7 @@ namespace { char DSE::ID = 0; INITIALIZE_PASS_BEGIN(DSE, "dse", "Dead Store Elimination", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 
INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(DSE, "dse", "Dead Store Elimination", false, false) @@ -108,9 +112,9 @@ FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); } /// If ValueSet is non-null, remove any deleted instructions from it as well. /// static void DeleteDeadInstruction(Instruction *I, - MemoryDependenceAnalysis &MD, - const TargetLibraryInfo *TLI, - SmallSetVector<Value*, 16> *ValueSet = 0) { + MemoryDependenceAnalysis &MD, + const TargetLibraryInfo *TLI, + SmallSetVector<Value*, 16> *ValueSet = nullptr) { SmallVector<Instruction*, 32> NowDeadInsts; NowDeadInsts.push_back(I); @@ -128,7 +132,7 @@ static void DeleteDeadInstruction(Instruction *I, for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) { Value *Op = DeadInst->getOperand(op); - DeadInst->setOperand(op, 0); + DeadInst->setOperand(op, nullptr); // If this operand just became dead, add it to the NowDeadInsts list. if (!Op->use_empty()) continue; @@ -190,6 +194,7 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo *TLI) { /// describe the memory operations for this instruction. static AliasAnalysis::Location getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { + const DataLayout *DL = AA.getDataLayout(); if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) return AA.getLocation(SI); @@ -199,13 +204,13 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { // If we don't have target data around, an unknown size in Location means // that we should use the size of the pointee type. This isn't valid for // memset/memcpy, which writes more than an i8. - if (Loc.Size == AliasAnalysis::UnknownSize && AA.getDataLayout() == 0) + if (Loc.Size == AliasAnalysis::UnknownSize && DL == nullptr) return AliasAnalysis::Location(); return Loc; } IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst); - if (II == 0) return AliasAnalysis::Location(); + if (!II) return AliasAnalysis::Location(); switch (II->getIntrinsicID()) { default: return AliasAnalysis::Location(); // Unhandled intrinsic. @@ -213,7 +218,7 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { // If we don't have target data around, an unknown size in Location means // that we should use the size of the pointee type. This isn't valid for // init.trampoline, which writes more than an i8. - if (AA.getDataLayout() == 0) return AliasAnalysis::Location(); + if (!DL) return AliasAnalysis::Location(); // FIXME: We don't know the size of the trampoline, so we can't really // handle it here. @@ -341,6 +346,7 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, AliasAnalysis &AA, int64_t &EarlierOff, int64_t &LaterOff) { + const DataLayout *DL = AA.getDataLayout(); const Value *P1 = Earlier.Ptr->stripPointerCasts(); const Value *P2 = Later.Ptr->stripPointerCasts(); @@ -354,8 +360,7 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, // If we have no DataLayout information around, then the size of the store // is inferrable from the pointee type. If they are the same type, then // we know that the store is safe. - if (AA.getDataLayout() == 0 && - Later.Ptr->getType() == Earlier.Ptr->getType()) + if (DL == nullptr && Later.Ptr->getType() == Earlier.Ptr->getType()) return OverwriteComplete; return OverwriteUnknown; @@ -369,17 +374,14 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, // Otherwise, we have to have size information, and the later store has to be // larger than the earlier one. 
if (Later.Size == AliasAnalysis::UnknownSize || - Earlier.Size == AliasAnalysis::UnknownSize || - AA.getDataLayout() == 0) + Earlier.Size == AliasAnalysis::UnknownSize || DL == nullptr) return OverwriteUnknown; // Check to see if the later store is to the entire object (either a global, - // an alloca, or a byval argument). If so, then it clearly overwrites any - // other store to the same object. - const DataLayout *TD = AA.getDataLayout(); - - const Value *UO1 = GetUnderlyingObject(P1, TD), - *UO2 = GetUnderlyingObject(P2, TD); + // an alloca, or a byval/inalloca argument). If so, then it clearly + // overwrites any other store to the same object. + const Value *UO1 = GetUnderlyingObject(P1, DL), + *UO2 = GetUnderlyingObject(P2, DL); // If we can't resolve the same pointers to the same object, then we can't // analyze them at all. @@ -397,8 +399,8 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, // pointers are equal, then we can reason about the two stores. EarlierOff = 0; LaterOff = 0; - const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, TD); - const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, TD); + const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, DL); + const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, DL); // If the base pointers still differ, we have two completely different stores. if (BP1 != BP2) @@ -460,7 +462,7 @@ static bool isPossibleSelfRead(Instruction *Inst, // Self reads can only happen for instructions that read memory. Get the // location read. AliasAnalysis::Location InstReadLoc = getLocForRead(Inst, AA); - if (InstReadLoc.Ptr == 0) return false; // Not a reading instruction. + if (!InstReadLoc.Ptr) return false; // Not a reading instruction. // If the read and written loc obviously don't alias, it isn't a read. if (AA.isNoAlias(InstReadLoc, InstStoreLoc)) return false; @@ -527,7 +529,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { DeleteDeadInstruction(SI, *MD, TLI); - if (NextInst == 0) // Next instruction deleted. + if (!NextInst) // Next instruction deleted. BBI = BB.begin(); else if (BBI != BB.begin()) // Revisit this instruction if possible. --BBI; @@ -542,7 +544,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { AliasAnalysis::Location Loc = getLocForWrite(Inst, *AA); // If we didn't get a useful location, fail. - if (Loc.Ptr == 0) + if (!Loc.Ptr) continue; while (InstDep.isDef() || InstDep.isClobber()) { @@ -556,7 +558,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { Instruction *DepWrite = InstDep.getInst(); AliasAnalysis::Location DepLoc = getLocForWrite(DepWrite, *AA); // If we didn't get a useful location, or if it isn't a size, bail out. - if (DepLoc.Ptr == 0) + if (!DepLoc.Ptr) break; // If we find a write that is a) removable (i.e., non-volatile), b) is @@ -679,7 +681,7 @@ bool DSE::HandleFree(CallInst *F) { if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) break; - Instruction *Next = llvm::next(BasicBlock::iterator(Dependency)); + Instruction *Next = std::next(BasicBlock::iterator(Dependency)); // DCE instructions only used to calculate that store DeleteDeadInstruction(Dependency, *MD, TLI); @@ -701,22 +703,6 @@ bool DSE::HandleFree(CallInst *F) { return MadeChange; } -namespace { - struct CouldRef { - typedef Value *argument_type; - const CallSite CS; - AliasAnalysis *AA; - - bool operator()(Value *I) { - // See if the call site touches the value. 
- AliasAnalysis::ModRefResult A = - AA->getModRefInfo(CS, I, getPointerSize(I, *AA)); - - return A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref; - } - }; -} - /// handleEndBlock - Remove dead stores to stack-allocated locations in the /// function end block. Ex: /// %A = alloca i32 @@ -742,11 +728,11 @@ bool DSE::handleEndBlock(BasicBlock &BB) { DeadStackObjects.insert(I); } - // Treat byval arguments the same, stores to them are dead at the end of the - // function. + // Treat byval or inalloca arguments the same, stores to them are dead at the + // end of the function. for (Function::arg_iterator AI = BB.getParent()->arg_begin(), AE = BB.getParent()->arg_end(); AI != AE; ++AI) - if (AI->hasByValAttr()) + if (AI->hasByValOrInAllocaAttr()) DeadStackObjects.insert(AI); // Scan the basic block backwards @@ -776,7 +762,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(), E = Pointers.end(); I != E; ++I) { dbgs() << **I; - if (llvm::next(I) != E) + if (std::next(I) != E) dbgs() << ", "; } dbgs() << '\n'); @@ -818,8 +804,13 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // If the call might load from any of our allocas, then any store above // the call is live. - CouldRef Pred = { CS, AA }; - DeadStackObjects.remove_if(Pred); + DeadStackObjects.remove_if([&](Value *I) { + // See if the call site touches the value. + AliasAnalysis::ModRefResult A = + AA->getModRefInfo(CS, I, getPointerSize(I, *AA)); + + return A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref; + }); // If all of the allocas were clobbered by the call then we're not going // to find anything else to process. @@ -862,20 +853,6 @@ bool DSE::handleEndBlock(BasicBlock &BB) { return MadeChange; } -namespace { - struct CouldAlias { - typedef Value *argument_type; - const AliasAnalysis::Location &LoadedLoc; - AliasAnalysis *AA; - - bool operator()(Value *I) { - // See if the loaded location could alias the stack location. - AliasAnalysis::Location StackLoc(I, getPointerSize(I, *AA)); - return !AA->isNoAlias(StackLoc, LoadedLoc); - } - }; -} - /// RemoveAccessedObjects - Check to see if the specified location may alias any /// of the stack objects in the DeadStackObjects set. If so, they become live /// because the location is being loaded. @@ -895,6 +872,9 @@ void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, } // Remove objects that could alias LoadedLoc. - CouldAlias Pred = { LoadedLoc, AA }; - DeadStackObjects.remove_if(Pred); + DeadStackObjects.remove_if([&](Value *I) { + // See if the loaded location could alias the stack location. 
+ AliasAnalysis::Location StackLoc(I, getPointerSize(I, *AA)); + return !AA->isNoAlias(StackLoc, LoadedLoc); + }); } diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 5266894..735f5c1 100644 --- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -12,23 +12,24 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "early-cse" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/ScopedHashTable.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/RecyclingAllocator.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" -#include <deque> +#include <vector> using namespace llvm; +#define DEBUG_TYPE "early-cse" + STATISTIC(NumSimplify, "Number of instructions simplified or DCE'd"); STATISTIC(NumCSE, "Number of instructions CSE'd"); STATISTIC(NumCSELoad, "Number of load instructions CSE'd"); @@ -207,7 +208,7 @@ namespace { return false; CallInst *CI = dyn_cast<CallInst>(Inst); - if (CI == 0 || !CI->onlyReadsMemory()) + if (!CI || !CI->onlyReadsMemory()) return false; return true; } @@ -262,7 +263,7 @@ namespace { /// cases. class EarlyCSE : public FunctionPass { public: - const DataLayout *TD; + const DataLayout *DL; const TargetLibraryInfo *TLI; DominatorTree *DT; typedef RecyclingAllocator<BumpPtrAllocator, @@ -303,7 +304,7 @@ public: initializeEarlyCSEPass(*PassRegistry::getPassRegistry()); } - bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; private: @@ -376,8 +377,8 @@ private: bool processNode(DomTreeNode *Node); // This transformation requires dominator postdominator info - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<DominatorTree>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetLibraryInfo>(); AU.setPreservesCFG(); } @@ -392,7 +393,7 @@ FunctionPass *llvm::createEarlyCSEPass() { } INITIALIZE_PASS_BEGIN(EarlyCSE, "early-cse", "Early CSE", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_PASS_END(EarlyCSE, "early-cse", "Early CSE", false, false) @@ -405,14 +406,14 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // have invalidated the live-out memory values of our parent value. For now, // just be conservative and invalidate memory if this block has multiple // predecessors. - if (BB->getSinglePredecessor() == 0) + if (!BB->getSinglePredecessor()) ++CurrentGeneration; /// LastStore - Keep track of the last non-volatile store that we saw... for /// as long as there in no instruction that reads memory. If we see a store /// to the same location, we delete the dead store. This zaps trivial dead /// stores which can occur in bitfield code among other things. - StoreInst *LastStore = 0; + StoreInst *LastStore = nullptr; bool Changed = false; @@ -432,7 +433,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // If the instruction can be simplified (e.g. X+0 = X) then replace it with // its simpler value. 
- if (Value *V = SimplifyInstruction(Inst, TD, TLI, DT)) { + if (Value *V = SimplifyInstruction(Inst, DL, TLI, DT)) { DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n'); Inst->replaceAllUsesWith(V); Inst->eraseFromParent(); @@ -462,7 +463,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { // Ignore volatile loads. if (!LI->isSimple()) { - LastStore = 0; + LastStore = nullptr; continue; } @@ -470,7 +471,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // generation, replace this instruction. std::pair<Value*, unsigned> InVal = AvailableLoads->lookup(Inst->getOperand(0)); - if (InVal.first != 0 && InVal.second == CurrentGeneration) { + if (InVal.first != nullptr && InVal.second == CurrentGeneration) { DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst << " to: " << *InVal.first << '\n'); if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first); @@ -483,20 +484,20 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // Otherwise, remember that we have this instruction. AvailableLoads->insert(Inst->getOperand(0), std::pair<Value*, unsigned>(Inst, CurrentGeneration)); - LastStore = 0; + LastStore = nullptr; continue; } // If this instruction may read from memory, forget LastStore. if (Inst->mayReadFromMemory()) - LastStore = 0; + LastStore = nullptr; // If this is a read-only call, process it. if (CallValue::canHandle(Inst)) { // If we have an available version of this call, and if it is the right // generation, replace this instruction. std::pair<Value*, unsigned> InVal = AvailableCalls->lookup(Inst); - if (InVal.first != 0 && InVal.second == CurrentGeneration) { + if (InVal.first != nullptr && InVal.second == CurrentGeneration) { DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst << " to: " << *InVal.first << '\n'); if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first); @@ -528,7 +529,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { LastStore->eraseFromParent(); Changed = true; ++NumDSE; - LastStore = 0; + LastStore = nullptr; continue; } @@ -552,11 +553,15 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { bool EarlyCSE::runOnFunction(Function &F) { - std::deque<StackNode *> nodesToProcess; + if (skipOptnoneFunction(F)) + return false; + + std::vector<StackNode *> nodesToProcess; - TD = getAnalysisIfAvailable<DataLayout>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; TLI = &getAnalysis<TargetLibraryInfo>(); - DT = &getAnalysis<DominatorTree>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); // Tables that the pass uses when walking the domtree. ScopedHTType AVTable; @@ -570,7 +575,7 @@ bool EarlyCSE::runOnFunction(Function &F) { bool Changed = false; // Process the root node. - nodesToProcess.push_front( + nodesToProcess.push_back( new StackNode(AvailableValues, AvailableLoads, AvailableCalls, CurrentGeneration, DT->getRootNode(), DT->getRootNode()->begin(), @@ -583,7 +588,7 @@ bool EarlyCSE::runOnFunction(Function &F) { while (!nodesToProcess.empty()) { // Grab the first item off the stack. Set the current generation, remove // the node from the stack, and process it. - StackNode *NodeToProcess = nodesToProcess.front(); + StackNode *NodeToProcess = nodesToProcess.back(); // Initialize class members. CurrentGeneration = NodeToProcess->currentGeneration(); @@ -597,7 +602,7 @@ bool EarlyCSE::runOnFunction(Function &F) { } else if (NodeToProcess->childIter() != NodeToProcess->end()) { // Push the next child onto the stack. 
DomTreeNode *child = NodeToProcess->nextChild(); - nodesToProcess.push_front( + nodesToProcess.push_back( new StackNode(AvailableValues, AvailableLoads, AvailableCalls, @@ -607,7 +612,7 @@ bool EarlyCSE::runOnFunction(Function &F) { // It has been processed, and there are no more children to process, // so delete it and pop it off the stack. delete NodeToProcess; - nodesToProcess.pop_front(); + nodesToProcess.pop_back(); } } // while (!nodes...) diff --git a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp index e7de07f..0430c18 100644 --- a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -11,14 +11,15 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "flattencfg" #include "llvm/Transforms/Scalar.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/IR/CFG.h" #include "llvm/Pass.h" -#include "llvm/Support/CFG.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "flattencfg" + namespace { struct FlattenCFGPass : public FunctionPass { static char ID; // Pass identification, replacement for typeid @@ -26,9 +27,9 @@ public: FlattenCFGPass() : FunctionPass(ID) { initializeFlattenCFGPassPass(*PassRegistry::getPassRegistry()); } - bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; - void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AliasAnalysis>(); } diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp index 6af269d..106eba0 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp @@ -15,35 +15,34 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "gvn" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Hashing.h" -#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/PHITransAddr.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Assembly/Writer.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/PatternMatch.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -51,6 +50,8 @@ using namespace llvm; using namespace PatternMatch; +#define DEBUG_TYPE "gvn" + STATISTIC(NumGVNInstr, "Number of instructions deleted"); STATISTIC(NumGVNLoad, "Number of loads deleted"); STATISTIC(NumGVNPRE, "Number of instructions PRE'd"); @@ -214,13 +215,13 @@ Expression 
ValueTable::create_cmp_expression(unsigned Opcode, } Expression ValueTable::create_extractvalue_expression(ExtractValueInst *EI) { - assert(EI != 0 && "Not an ExtractValueInst?"); + assert(EI && "Not an ExtractValueInst?"); Expression e; e.type = EI->getType(); e.opcode = 0; IntrinsicInst *I = dyn_cast<IntrinsicInst>(EI->getAggregateOperand()); - if (I != 0 && EI->getNumIndices() == 1 && *EI->idx_begin() == 0 ) { + if (I != nullptr && EI->getNumIndices() == 1 && *EI->idx_begin() == 0 ) { // EI might be an extract from one of our recognised intrinsics. If it // is we'll synthesize a semantically equivalent expression instead on // an extract value expression. @@ -328,7 +329,7 @@ uint32_t ValueTable::lookup_or_add_call(CallInst *C) { const MemoryDependenceAnalysis::NonLocalDepInfo &deps = MD->getNonLocalCallDependency(CallSite(C)); // FIXME: Move the checking logic to MemDep! - CallInst* cdep = 0; + CallInst* cdep = nullptr; // Check to see if we have a single dominating call instruction that is // identical to C. @@ -339,8 +340,8 @@ uint32_t ValueTable::lookup_or_add_call(CallInst *C) { // We don't handle non-definitions. If we already have a call, reject // instruction dependencies. - if (!I->getResult().isDef() || cdep != 0) { - cdep = 0; + if (!I->getResult().isDef() || cdep != nullptr) { + cdep = nullptr; break; } @@ -351,7 +352,7 @@ uint32_t ValueTable::lookup_or_add_call(CallInst *C) { continue; } - cdep = 0; + cdep = nullptr; break; } @@ -552,7 +553,7 @@ namespace { static AvailableValueInBlock getUndef(BasicBlock *BB) { AvailableValueInBlock Res; Res.BB = BB; - Res.Val.setPointer(0); + Res.Val.setPointer(nullptr); Res.Val.setInt(UndefVal); Res.Offset = 0; return Res; @@ -587,7 +588,7 @@ namespace { bool NoLoads; MemoryDependenceAnalysis *MD; DominatorTree *DT; - const DataLayout *TD; + const DataLayout *DL; const TargetLibraryInfo *TLI; SetVector<BasicBlock *> DeadBlocks; @@ -612,11 +613,11 @@ namespace { public: static char ID; // Pass identification, replacement for typeid explicit GVN(bool noloads = false) - : FunctionPass(ID), NoLoads(noloads), MD(0) { + : FunctionPass(ID), NoLoads(noloads), MD(nullptr) { initializeGVNPass(*PassRegistry::getPassRegistry()); } - bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; /// markInstructionForDeletion - This removes the specified instruction from /// our various maps and marks it for deletion. @@ -625,7 +626,7 @@ namespace { InstrsToErase.push_back(I); } - const DataLayout *getDataLayout() const { return TD; } + const DataLayout *getDataLayout() const { return DL; } DominatorTree &getDominatorTree() const { return *DT; } AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); } MemoryDependenceAnalysis &getMemDep() const { return *MD; } @@ -650,7 +651,7 @@ namespace { /// removeFromLeaderTable - Scan the list of values corresponding to a given /// value number, and remove the given instruction if encountered. 
void removeFromLeaderTable(uint32_t N, Instruction *I, BasicBlock *BB) { - LeaderTableEntry* Prev = 0; + LeaderTableEntry* Prev = nullptr; LeaderTableEntry* Curr = &LeaderTable[N]; while (Curr->Val != I || Curr->BB != BB) { @@ -662,8 +663,8 @@ namespace { Prev->Next = Curr->Next; } else { if (!Curr->Next) { - Curr->Val = 0; - Curr->BB = 0; + Curr->Val = nullptr; + Curr->BB = nullptr; } else { LeaderTableEntry* Next = Curr->Next; Curr->Val = Next->Val; @@ -677,14 +678,14 @@ namespace { SmallVector<std::pair<TerminatorInst*, unsigned>, 4> toSplit; // This transformation requires dominator postdominator info - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<DominatorTree>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetLibraryInfo>(); if (!NoLoads) AU.addRequired<MemoryDependenceAnalysis>(); AU.addRequired<AliasAnalysis>(); - AU.addPreserved<DominatorTree>(); + AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<AliasAnalysis>(); } @@ -727,7 +728,7 @@ FunctionPass *llvm::createGVNPass(bool NoLoads) { INITIALIZE_PASS_BEGIN(GVN, "gvn", "Global Value Numbering", false, false) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false) @@ -818,8 +819,7 @@ SpeculationFailure: // Mark as unavailable. EntryVal = 0; - for (succ_iterator I = succ_begin(Entry), E = succ_end(Entry); I != E; ++I) - BBWorklist.push_back(*I); + BBWorklist.append(succ_begin(Entry), succ_end(Entry)); } while (!BBWorklist.empty()); return false; @@ -830,7 +830,7 @@ SpeculationFailure: /// CoerceAvailableValueToLoadType will succeed. static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy, - const DataLayout &TD) { + const DataLayout &DL) { // If the loaded or stored value is an first class array or struct, don't try // to transform them. We need to be able to bitcast to integer. if (LoadTy->isStructTy() || LoadTy->isArrayTy() || @@ -839,8 +839,8 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, return false; // The store has to be at least as big as the load. - if (TD.getTypeSizeInBits(StoredVal->getType()) < - TD.getTypeSizeInBits(LoadTy)) + if (DL.getTypeSizeInBits(StoredVal->getType()) < + DL.getTypeSizeInBits(LoadTy)) return false; return true; @@ -855,15 +855,15 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy, Instruction *InsertPt, - const DataLayout &TD) { - if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, TD)) - return 0; + const DataLayout &DL) { + if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL)) + return nullptr; // If this is already the right type, just return it. Type *StoredValTy = StoredVal->getType(); - uint64_t StoreSize = TD.getTypeSizeInBits(StoredValTy); - uint64_t LoadSize = TD.getTypeSizeInBits(LoadedTy); + uint64_t StoreSize = DL.getTypeSizeInBits(StoredValTy); + uint64_t LoadSize = DL.getTypeSizeInBits(LoadedTy); // If the store and reload are the same size, we can always reuse it. if (StoreSize == LoadSize) { @@ -874,13 +874,13 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, // Convert source pointers to integers, which can be bitcast. 
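The CanCoerceMustAliasedValueToLoad hunk above reduces to two checks: neither side may be a first-class aggregate (the value must be bitcastable to an integer), and the store must be at least as wide as the load. The rule in isolation, over a mock descriptor rather than LLVM's Type and DataLayout (MockType and its fields are assumptions for this sketch):

    #include <cstdint>

    // Stand-in for what DataLayout::getTypeSizeInBits and the isStructTy /
    // isArrayTy predicates report for a real llvm::Type.
    struct MockType {
      uint64_t sizeInBits;
      bool isAggregate; // struct or array: cannot be bitcast to an integer
    };

    // A stored value can only be coerced to satisfy a must-aliased load if
    // neither type is an aggregate and the store covers at least as many bits.
    bool canCoerce(const MockType &stored, const MockType &loaded) {
      if (stored.isAggregate || loaded.isAggregate)
        return false;
      return stored.sizeInBits >= loaded.sizeInBits;
    }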
if (StoredValTy->getScalarType()->isPointerTy()) { - StoredValTy = TD.getIntPtrType(StoredValTy); + StoredValTy = DL.getIntPtrType(StoredValTy); StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt); } Type *TypeToCastTo = LoadedTy; if (TypeToCastTo->getScalarType()->isPointerTy()) - TypeToCastTo = TD.getIntPtrType(TypeToCastTo); + TypeToCastTo = DL.getIntPtrType(TypeToCastTo); if (StoredValTy != TypeToCastTo) StoredVal = new BitCastInst(StoredVal, TypeToCastTo, "", InsertPt); @@ -899,7 +899,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, // Convert source pointers to integers, which can be manipulated. if (StoredValTy->getScalarType()->isPointerTy()) { - StoredValTy = TD.getIntPtrType(StoredValTy); + StoredValTy = DL.getIntPtrType(StoredValTy); StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt); } @@ -911,7 +911,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, // If this is a big-endian system, we need to shift the value down to the low // bits so that a truncate will work. - if (TD.isBigEndian()) { + if (DL.isBigEndian()) { Constant *Val = ConstantInt::get(StoredVal->getType(), StoreSize-LoadSize); StoredVal = BinaryOperator::CreateLShr(StoredVal, Val, "tmp", InsertPt); } @@ -942,15 +942,15 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, Value *WritePtr, uint64_t WriteSizeInBits, - const DataLayout &TD) { + const DataLayout &DL) { // If the loaded or stored value is a first class array or struct, don't try // to transform them. We need to be able to bitcast to integer. if (LoadTy->isStructTy() || LoadTy->isArrayTy()) return -1; int64_t StoreOffset = 0, LoadOffset = 0; - Value *StoreBase = GetPointerBaseWithConstantOffset(WritePtr,StoreOffset,&TD); - Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, &TD); + Value *StoreBase = GetPointerBaseWithConstantOffset(WritePtr,StoreOffset,&DL); + Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, &DL); if (StoreBase != LoadBase) return -1; @@ -972,7 +972,7 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, // If the load and store don't overlap at all, the store doesn't provide // anything to the load. In this case, they really don't alias at all, AA // must have gotten confused. - uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy); + uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy); if ((WriteSizeInBits & 7) | (LoadSize & 7)) return -1; @@ -1015,61 +1015,61 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, /// memdep query of a load that ends up being a clobbering store. static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr, StoreInst *DepSI, - const DataLayout &TD) { + const DataLayout &DL) { // Cannot handle reading from store of first-class aggregate yet. if (DepSI->getValueOperand()->getType()->isStructTy() || DepSI->getValueOperand()->getType()->isArrayTy()) return -1; Value *StorePtr = DepSI->getPointerOperand(); - uint64_t StoreSize =TD.getTypeSizeInBits(DepSI->getValueOperand()->getType()); + uint64_t StoreSize =DL.getTypeSizeInBits(DepSI->getValueOperand()->getType()); return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, - StorePtr, StoreSize, TD); + StorePtr, StoreSize, DL); } /// AnalyzeLoadFromClobberingLoad - This function is called when we have a /// memdep query of a load that ends up being clobbered by another load. 
See if /// the other load can feed into the second load. static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, - LoadInst *DepLI, const DataLayout &TD){ + LoadInst *DepLI, const DataLayout &DL){ // Cannot handle reading from store of first-class aggregate yet. if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy()) return -1; Value *DepPtr = DepLI->getPointerOperand(); - uint64_t DepSize = TD.getTypeSizeInBits(DepLI->getType()); - int R = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, TD); + uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType()); + int R = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL); if (R != -1) return R; // If we have a load/load clobber and DepLI can be widened to cover this load, // then we should widen it! int64_t LoadOffs = 0; const Value *LoadBase = - GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, &TD); - unsigned LoadSize = TD.getTypeStoreSize(LoadTy); + GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, &DL); + unsigned LoadSize = DL.getTypeStoreSize(LoadTy); unsigned Size = MemoryDependenceAnalysis:: - getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI, TD); + getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI, DL); if (Size == 0) return -1; - return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, TD); + return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, DL); } static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, MemIntrinsic *MI, - const DataLayout &TD) { + const DataLayout &DL) { // If the mem operation is a non-constant size, we can't handle it. ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength()); - if (SizeCst == 0) return -1; + if (!SizeCst) return -1; uint64_t MemSizeInBits = SizeCst->getZExtValue()*8; // If this is memset, we just need to see if the offset is valid in the size // of the memset. if (MI->getIntrinsicID() == Intrinsic::memset) return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(), - MemSizeInBits, TD); + MemSizeInBits, DL); // If we have a memcpy/memmove, the only case we can handle is if this is a // copy from constant memory. In that case, we can read directly from the @@ -1077,14 +1077,14 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, MemTransferInst *MTI = cast<MemTransferInst>(MI); Constant *Src = dyn_cast<Constant>(MTI->getSource()); - if (Src == 0) return -1; + if (!Src) return -1; - GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, &TD)); - if (GV == 0 || !GV->isConstant()) return -1; + GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, &DL)); + if (!GV || !GV->isConstant()) return -1; // See if the access is within the bounds of the transfer. int Offset = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, - MI->getDest(), MemSizeInBits, TD); + MI->getDest(), MemSizeInBits, DL); if (Offset == -1) return Offset; @@ -1097,7 +1097,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS)); - if (ConstantFoldLoadFromConstPtr(Src, &TD)) + if (ConstantFoldLoadFromConstPtr(Src, &DL)) return Offset; return -1; } @@ -1110,11 +1110,11 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, /// before we give up.
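The AnalyzeLoadFromClobbering* family above bottoms out in AnalyzeLoadFromClobberingWrite, which is pure arithmetic on byte offsets from a shared base pointer. A standalone sketch of that containment check, simplified to drop the partial-overlap diagnostics the real code also performs:

    #include <cstdint>

    // Given a store of storeBits at byte offset storeOff and a load of loadBits
    // at byte offset loadOff, both relative to the same base pointer, return
    // the byte offset of the load's data inside the stored value, or -1 if the
    // store does not fully cover the load. Byte-sized accesses only, as in GVN.
    int64_t loadOffsetInStore(int64_t storeOff, uint64_t storeBits,
                              int64_t loadOff, uint64_t loadBits) {
      if ((storeBits & 7) || (loadBits & 7))
        return -1; // not byte-sized, give up
      uint64_t storeBytes = storeBits / 8, loadBytes = loadBits / 8;
      if (loadOff < storeOff ||
          uint64_t(loadOff - storeOff) + loadBytes > storeBytes)
        return -1; // load not entirely inside the store
      return loadOff - storeOff;
    }

    // e.g. a 4-byte store at offset 0 fully covers a 1-byte load at offset 2:
    // loadOffsetInStore(0, 32, 2, 8) == 2.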
static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy, - Instruction *InsertPt, const DataLayout &TD){ + Instruction *InsertPt, const DataLayout &DL){ LLVMContext &Ctx = SrcVal->getType()->getContext(); - uint64_t StoreSize = (TD.getTypeSizeInBits(SrcVal->getType()) + 7) / 8; - uint64_t LoadSize = (TD.getTypeSizeInBits(LoadTy) + 7) / 8; + uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8; + uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8; IRBuilder<> Builder(InsertPt->getParent(), InsertPt); @@ -1122,13 +1122,13 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, // to an integer type to start with. if (SrcVal->getType()->getScalarType()->isPointerTy()) SrcVal = Builder.CreatePtrToInt(SrcVal, - TD.getIntPtrType(SrcVal->getType())); + DL.getIntPtrType(SrcVal->getType())); if (!SrcVal->getType()->isIntegerTy()) SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8)); // Shift the bits to the least significant depending on endianness. unsigned ShiftAmt; - if (TD.isLittleEndian()) + if (DL.isLittleEndian()) ShiftAmt = Offset*8; else ShiftAmt = (StoreSize-LoadSize-Offset)*8; @@ -1139,7 +1139,7 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, if (LoadSize != StoreSize) SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8)); - return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, TD); + return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, DL); } /// GetLoadValueForLoad - This function is called when we have a @@ -1150,11 +1150,11 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy, Instruction *InsertPt, GVN &gvn) { - const DataLayout &TD = *gvn.getDataLayout(); + const DataLayout &DL = *gvn.getDataLayout(); // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to // widen SrcVal out to a larger load. - unsigned SrcValSize = TD.getTypeStoreSize(SrcVal->getType()); - unsigned LoadSize = TD.getTypeStoreSize(LoadTy); + unsigned SrcValSize = DL.getTypeStoreSize(SrcVal->getType()); + unsigned LoadSize = DL.getTypeStoreSize(LoadTy); if (Offset+LoadSize > SrcValSize) { assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!"); assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load"); @@ -1186,7 +1186,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, // Replace uses of the original load with the wider load. On a big endian // system, we need to shift down to get the relevant bits. Value *RV = NewLoad; - if (TD.isBigEndian()) + if (DL.isBigEndian()) RV = Builder.CreateLShr(RV, NewLoadSize*8-SrcVal->getType()->getPrimitiveSizeInBits()); RV = Builder.CreateTrunc(RV, SrcVal->getType()); @@ -1201,7 +1201,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, SrcVal = NewLoad; } - return GetStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, TD); + return GetStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, DL); } @@ -1209,9 +1209,9 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, /// memdep query of a load that ends up being a clobbering mem intrinsic. 
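GetStoreValueForLoad above picks the shift so the bytes the load wants land in the low bits before truncation: Offset*8 on little-endian targets, (StoreSize-LoadSize-Offset)*8 on big-endian ones. The same arithmetic on a plain 64-bit integer, assuming byte units with offset + loadSize <= storeSize <= 8 (a sketch of the bit manipulation only, not the IRBuilder code):

    #include <cstdint>

    uint64_t extractBits(uint64_t storedVal, unsigned storeSize,
                         unsigned loadSize, unsigned offset, bool littleEndian) {
      unsigned shiftBytes = littleEndian ? offset
                                         : storeSize - loadSize - offset;
      uint64_t v = storedVal >> (shiftBytes * 8); // move wanted bytes to low end
      if (loadSize * 8 < 64)
        v &= (uint64_t(1) << (loadSize * 8)) - 1; // "truncate" to loadSize bytes
      return v;
    }

    // Little-endian example: extractBits(0x11223344, 4, 2, 1, true) == 0x2233.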
static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, Type *LoadTy, Instruction *InsertPt, - const DataLayout &TD){ + const DataLayout &DL){ LLVMContext &Ctx = LoadTy->getContext(); - uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy)/8; + uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy)/8; IRBuilder<> Builder(InsertPt->getParent(), InsertPt); @@ -1242,7 +1242,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, ++NumBytesSet; } - return CoerceAvailableValueToLoadType(Val, LoadTy, InsertPt, TD); + return CoerceAvailableValueToLoadType(Val, LoadTy, InsertPt, DL); } // Otherwise, this is a memcpy/memmove from a constant global. @@ -1258,7 +1258,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS)); - return ConstantFoldLoadFromConstPtr(Src, &TD); + return ConstantFoldLoadFromConstPtr(Src, &DL); } @@ -1324,10 +1324,10 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) c if (isSimpleValue()) { Res = getSimpleValue(); if (Res->getType() != LoadTy) { - const DataLayout *TD = gvn.getDataLayout(); - assert(TD && "Need target data to handle type mismatch case"); + const DataLayout *DL = gvn.getDataLayout(); + assert(DL && "Need target data to handle type mismatch case"); Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), - *TD); + *DL); DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " " << *getSimpleValue() << '\n' @@ -1346,10 +1346,10 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) c << *Res << '\n' << "\n\n\n"); } } else if (isMemIntrinValue()) { - const DataLayout *TD = gvn.getDataLayout(); - assert(TD && "Need target data to handle type mismatch case"); + const DataLayout *DL = gvn.getDataLayout(); + assert(DL && "Need target data to handle type mismatch case"); Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, - LoadTy, BB->getTerminator(), *TD); + LoadTy, BB->getTerminator(), *DL); DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset << " " << *getMemIntrinValue() << '\n' << *Res << '\n' << "\n\n\n"); @@ -1402,9 +1402,9 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, // read by the load, we can extract the bits we need for the load from the // stored value. if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInfo.getInst())) { - if (TD && Address) { + if (DL && Address) { int Offset = AnalyzeLoadFromClobberingStore(LI->getType(), Address, - DepSI, *TD); + DepSI, *DL); if (Offset != -1) { ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB, DepSI->getValueOperand(), @@ -1421,10 +1421,9 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInfo.getInst())) { // If this is a clobber and L is the first instruction in its block, then // we have the first instruction in the entry block. 
- if (DepLI != LI && Address && TD) { - int Offset = AnalyzeLoadFromClobberingLoad(LI->getType(), - LI->getPointerOperand(), - DepLI, *TD); + if (DepLI != LI && Address && DL) { + int Offset = AnalyzeLoadFromClobberingLoad(LI->getType(), Address, + DepLI, *DL); if (Offset != -1) { ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB,DepLI, @@ -1437,9 +1436,9 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, // If the clobbering value is a memset/memcpy/memmove, see if we can // forward a value on from it. if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) { - if (TD && Address) { + if (DL && Address) { int Offset = AnalyzeLoadFromClobberingMemInst(LI->getType(), Address, - DepMI, *TD); + DepMI, *DL); if (Offset != -1) { ValuesPerBlock.push_back(AvailableValueInBlock::getMI(DepBB, DepMI, Offset)); @@ -1465,14 +1464,21 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, continue; } + // Loading from calloc (which zero initializes memory) -> zero + if (isCallocLikeFn(DepInst, TLI)) { + ValuesPerBlock.push_back(AvailableValueInBlock::get( + DepBB, Constant::getNullValue(LI->getType()))); + continue; + } + if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) { // Reject loads and stores that are to the same address but are of // different types if we have to. if (S->getValueOperand()->getType() != LI->getType()) { // If the stored value is larger or equal to the loaded value, we can // reuse it. - if (TD == 0 || !CanCoerceMustAliasedValueToLoad(S->getValueOperand(), - LI->getType(), *TD)) { + if (!DL || !CanCoerceMustAliasedValueToLoad(S->getValueOperand(), + LI->getType(), *DL)) { UnavailableBlocks.push_back(DepBB); continue; } @@ -1488,7 +1494,7 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, if (LD->getType() != LI->getType()) { // If the stored value is larger or equal to the loaded value, we can // reuse it. - if (TD == 0 || !CanCoerceMustAliasedValueToLoad(LD, LI->getType(),*TD)){ + if (!DL || !CanCoerceMustAliasedValueToLoad(LD, LI->getType(),*DL)) { UnavailableBlocks.push_back(DepBB); continue; } @@ -1541,7 +1547,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // Check to see how many predecessors have the loaded value fully // available. - DenseMap<BasicBlock*, Value*> PredLoads; + MapVector<BasicBlock *, Value *> PredLoads; DenseMap<BasicBlock*, char> FullyAvailableBlocks; for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) FullyAvailableBlocks[ValuesPerBlock[i].BB] = true; @@ -1555,7 +1561,6 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) { continue; } - PredLoads[Pred] = 0; if (Pred->getTerminator()->getNumSuccessors() != 1) { if (isa<IndirectBrInst>(Pred->getTerminator())) { @@ -1572,11 +1577,14 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, } CriticalEdgePred.push_back(Pred); + } else { + // Only add the predecessors that will not be split for now. + PredLoads[Pred] = nullptr; } } // Decide whether PRE is profitable for this load. - unsigned NumUnavailablePreds = PredLoads.size(); + unsigned NumUnavailablePreds = PredLoads.size() + CriticalEdgePred.size(); assert(NumUnavailablePreds != 0 && "Fully available value should already be eliminated!"); @@ -1588,12 +1596,10 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, return false; // Split critical edges, and update the unavailable predecessors accordingly. 
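One quiet but important change in the PerformLoadPRE hunk above: PredLoads goes from DenseMap to MapVector. DenseMap iteration order depends on hashing and allocation, so any IR emitted while walking it could differ from run to run; MapVector iterates in insertion order, which keeps the pass's output deterministic. A small illustration (the header is the real llvm/ADT/MapVector.h; the demo function itself is hypothetical):

    #include "llvm/ADT/MapVector.h"
    #include "llvm/Support/raw_ostream.h"

    void demo() {
      llvm::MapVector<int, const char *> m; // map lookup + vector of keys
      m[30] = "first";
      m[10] = "second";
      m[20] = "third";
      for (auto &kv : m) // always visits 30, 10, 20: insertion order
        llvm::outs() << kv.first << " -> " << kv.second << "\n";
    }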
- for (SmallVectorImpl<BasicBlock *>::iterator I = CriticalEdgePred.begin(), - E = CriticalEdgePred.end(); I != E; I++) { - BasicBlock *OrigPred = *I; + for (BasicBlock *OrigPred : CriticalEdgePred) { BasicBlock *NewPred = splitCriticalEdges(OrigPred, LoadBB); - PredLoads.erase(OrigPred); - PredLoads[NewPred] = 0; + assert(!PredLoads.count(OrigPred) && "Split edges shouldn't be in map!"); + PredLoads[NewPred] = nullptr; DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->" << LoadBB->getName() << '\n'); } @@ -1601,9 +1607,8 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // Check if the load can safely be moved to all the unavailable predecessors. bool CanDoPRE = true; SmallVector<Instruction*, 8> NewInsts; - for (DenseMap<BasicBlock*, Value*>::iterator I = PredLoads.begin(), - E = PredLoads.end(); I != E; ++I) { - BasicBlock *UnavailablePred = I->first; + for (auto &PredLoad : PredLoads) { + BasicBlock *UnavailablePred = PredLoad.first; // Do PHI translation to get its value in the predecessor if necessary. The // returned pointer (if non-null) is guaranteed to dominate UnavailablePred. @@ -1611,21 +1616,21 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // If all preds have a single successor, then we know it is safe to insert // the load on the pred (?!?), so we can insert code to materialize the // pointer if it is not available. - PHITransAddr Address(LI->getPointerOperand(), TD); - Value *LoadPtr = 0; + PHITransAddr Address(LI->getPointerOperand(), DL); + Value *LoadPtr = nullptr; LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, *DT, NewInsts); // If we couldn't find or insert a computation of this phi translated value, // we fail PRE. - if (LoadPtr == 0) { + if (!LoadPtr) { DEBUG(dbgs() << "COULDN'T INSERT PHI TRANSLATED VALUE OF: " << *LI->getPointerOperand() << "\n"); CanDoPRE = false; break; } - I->second = LoadPtr; + PredLoad.second = LoadPtr; } if (!CanDoPRE) { @@ -1634,8 +1639,8 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (MD) MD->removeInstruction(I); I->eraseFromParent(); } - // HINT:Don't revert the edge-splitting as following transformation may - // also need to split these critial edges. + // HINT: Don't revert the edge-splitting as following transformation may + // also need to split these critical edges. 
return !CriticalEdgePred.empty(); } @@ -1656,10 +1661,9 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, VN.lookup_or_add(NewInsts[i]); } - for (DenseMap<BasicBlock*, Value*>::iterator I = PredLoads.begin(), - E = PredLoads.end(); I != E; ++I) { - BasicBlock *UnavailablePred = I->first; - Value *LoadPtr = I->second; + for (const auto &PredLoad : PredLoads) { + BasicBlock *UnavailablePred = PredLoad.first; + Value *LoadPtr = PredLoad.second; Instruction *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre", false, LI->getAlignment(), @@ -1712,7 +1716,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) { DEBUG( dbgs() << "GVN: non-local load "; - WriteAsOperand(dbgs(), LI); + LI->printAsOperand(dbgs()); dbgs() << " has unknown dependencies\n"; ); return false; @@ -1778,7 +1782,7 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) { MDNode *ReplMD = Metadata[i].second; switch(Kind) { default: - ReplInst->setMetadata(Kind, NULL); // Remove unknown metadata + ReplInst->setMetadata(Kind, nullptr); // Remove unknown metadata break; case LLVMContext::MD_dbg: llvm_unreachable("getAllMetadataOtherThanDebugLoc returned a MD_dbg"); @@ -1789,11 +1793,15 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) { ReplInst->setMetadata(Kind, MDNode::getMostGenericRange(IMD, ReplMD)); break; case LLVMContext::MD_prof: - llvm_unreachable("MD_prof in a non terminator instruction"); + llvm_unreachable("MD_prof in a non-terminator instruction"); break; case LLVMContext::MD_fpmath: ReplInst->setMetadata(Kind, MDNode::getMostGenericFPMath(IMD, ReplMD)); break; + case LLVMContext::MD_invariant_load: + // Only set the !invariant.load if it is present in both instructions. + ReplInst->setMetadata(Kind, IMD); + break; } } } @@ -1823,7 +1831,7 @@ bool GVN::processLoad(LoadInst *L) { // If we have a clobber and target data is around, see if this is a clobber // that we can fix up through code synthesis. - if (Dep.isClobber() && TD) { + if (Dep.isClobber() && DL) { // Check to see if we have something like this: // store i32 123, i32* %P // %A = bitcast i32* %P to i8* @@ -1834,14 +1842,14 @@ bool GVN::processLoad(LoadInst *L) { // a common base + constant offset, and if the previous store (or memset) // completely covers this load. This sort of thing can happen in bitfield // access code. 
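patchReplacementInstruction in the hunk above keeps a metadata kind on the surviving instruction only when that is sound: mergeable kinds (range, fpmath) are combined via their getMostGeneric* helpers, unknown kinds are dropped, and the newly handled !invariant.load survives only when both instructions carried it. A toy model of that intersect-or-drop policy over string keys (the real code operates on MDNode kind IDs, not strings):

    #include <map>
    #include <string>

    using MDMap = std::map<std::string, std::string>;

    // Keep only the kinds present on both instructions; anything carried by
    // just one side is dropped as unprovable for the merged instruction.
    MDMap mergeMetadata(const MDMap &repl, const MDMap &orig) {
      MDMap out;
      for (const auto &kv : orig)
        if (repl.count(kv.first))
          out.insert(kv);
      return out;
    }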
- Value *AvailVal = 0; + Value *AvailVal = nullptr; if (StoreInst *DepSI = dyn_cast<StoreInst>(Dep.getInst())) { int Offset = AnalyzeLoadFromClobberingStore(L->getType(), L->getPointerOperand(), - DepSI, *TD); + DepSI, *DL); if (Offset != -1) AvailVal = GetStoreValueForLoad(DepSI->getValueOperand(), Offset, - L->getType(), L, *TD); + L->getType(), L, *DL); } // Check to see if we have something like this: @@ -1856,7 +1864,7 @@ bool GVN::processLoad(LoadInst *L) { int Offset = AnalyzeLoadFromClobberingLoad(L->getType(), L->getPointerOperand(), - DepLI, *TD); + DepLI, *DL); if (Offset != -1) AvailVal = GetLoadValueForLoad(DepLI, Offset, L->getType(), L, *this); } @@ -1866,9 +1874,9 @@ bool GVN::processLoad(LoadInst *L) { if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(Dep.getInst())) { int Offset = AnalyzeLoadFromClobberingMemInst(L->getType(), L->getPointerOperand(), - DepMI, *TD); + DepMI, *DL); if (Offset != -1) - AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, *TD); + AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, *DL); } if (AvailVal) { @@ -1890,7 +1898,7 @@ bool GVN::processLoad(LoadInst *L) { DEBUG( // fast print dep, using operator<< on instruction is too slow. dbgs() << "GVN: load "; - WriteAsOperand(dbgs(), L); + L->printAsOperand(dbgs()); Instruction *I = Dep.getInst(); dbgs() << " is clobbered by " << *I << '\n'; ); @@ -1905,7 +1913,7 @@ bool GVN::processLoad(LoadInst *L) { DEBUG( // fast print dep, using operator<< on instruction is too slow. dbgs() << "GVN: load "; - WriteAsOperand(dbgs(), L); + L->printAsOperand(dbgs()); dbgs() << " has unknown dependence\n"; ); return false; @@ -1919,10 +1927,10 @@ bool GVN::processLoad(LoadInst *L) { // actually have the same type. See if we know how to reuse the stored // value (depending on its type). if (StoredVal->getType() != L->getType()) { - if (TD) { + if (DL) { StoredVal = CoerceAvailableValueToLoadType(StoredVal, L->getType(), - L, *TD); - if (StoredVal == 0) + L, *DL); + if (!StoredVal) return false; DEBUG(dbgs() << "GVN COERCED STORE:\n" << *DepSI << '\n' << *StoredVal @@ -1948,10 +1956,10 @@ bool GVN::processLoad(LoadInst *L) { // the same type. See if we know how to reuse the previously loaded value // (depending on its type). if (DepLI->getType() != L->getType()) { - if (TD) { + if (DL) { AvailableVal = CoerceAvailableValueToLoadType(DepLI, L->getType(), - L, *TD); - if (AvailableVal == 0) + L, *DL); + if (!AvailableVal) return false; DEBUG(dbgs() << "GVN COERCED LOAD:\n" << *DepLI << "\n" << *AvailableVal @@ -1991,6 +1999,15 @@ bool GVN::processLoad(LoadInst *L) { } } + // If this load follows a calloc (which zero initializes memory), + // then the loaded value is zero + if (isCallocLikeFn(DepInst, TLI)) { + L->replaceAllUsesWith(Constant::getNullValue(L->getType())); + markInstructionForDeletion(L); + ++NumGVNLoad; + return true; + } + return false; } @@ -2001,9 +2018,9 @@ bool GVN::processLoad(LoadInst *L) { // a few comparisons of DFS numbers. 
Value *GVN::findLeader(const BasicBlock *BB, uint32_t num) { LeaderTableEntry Vals = LeaderTable[num]; - if (!Vals.Val) return 0; + if (!Vals.Val) return nullptr; - Value *Val = 0; + Value *Val = nullptr; if (DT->dominates(Vals.BB, BB)) { Val = Vals.Val; if (isa<Constant>(Val)) return Val; @@ -2030,7 +2047,7 @@ unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To, unsigned Count = 0; for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); UI != UE; ) { - Use &U = (UI++).getUse(); + Use &U = *UI++; if (DT->dominates(Root, U)) { U.set(To); @@ -2054,7 +2071,7 @@ static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E, const BasicBlock *Src = E.getStart(); assert((!Pred || Pred == Src) && "No edge between these basic blocks!"); (void)Src; - return Pred != 0; + return Pred != nullptr; } /// propagateEquality - The given values are known to be equal in every block @@ -2202,7 +2219,7 @@ bool GVN::processInstruction(Instruction *I) { // to value numbering it. Value numbering often exposes redundancies, for // example if it determines that %y is equal to %x then the instruction // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify. - if (Value *V = SimplifyInstruction(I, TD, TLI, DT)) { + if (Value *V = SimplifyInstruction(I, DL, TLI, DT)) { I->replaceAllUsesWith(V); if (MD && V->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(V); @@ -2298,7 +2315,7 @@ bool GVN::processInstruction(Instruction *I) { // Perform fast-path value-number based elimination of values inherited from // dominators. Value *repl = findLeader(I->getParent(), Num); - if (repl == 0) { + if (!repl) { // Failure, just remember this instance for future use. addToLeaderTable(Num, I, I->getParent()); return false; @@ -2314,10 +2331,14 @@ bool GVN::processInstruction(Instruction *I) { /// runOnFunction - This is the main transformation entry point for a function. bool GVN::runOnFunction(Function& F) { + if (skipOptnoneFunction(F)) + return false; + if (!NoLoads) MD = &getAnalysis<MemoryDependenceAnalysis>(); - DT = &getAnalysis<DominatorTree>(); - TD = getAnalysisIfAvailable<DataLayout>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; TLI = &getAnalysis<TargetLibraryInfo>(); VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>()); VN.setMemDep(MD); @@ -2419,10 +2440,7 @@ bool GVN::processBlock(BasicBlock *BB) { bool GVN::performPRE(Function &F) { bool Changed = false; SmallVector<std::pair<Value*, BasicBlock*>, 8> predMap; - for (df_iterator<BasicBlock*> DI = df_begin(&F.getEntryBlock()), - DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) { - BasicBlock *CurrentBlock = *DI; - + for (BasicBlock *CurrentBlock : depth_first(&F.getEntryBlock())) { // Nothing to PRE in the entry block. if (CurrentBlock == &F.getEntryBlock()) continue; @@ -2462,7 +2480,7 @@ bool GVN::performPRE(Function &F) { // more complicated to get right. 
unsigned NumWith = 0; unsigned NumWithout = 0; - BasicBlock *PREPred = 0; + BasicBlock *PREPred = nullptr; predMap.clear(); for (pred_iterator PI = pred_begin(CurrentBlock), @@ -2480,8 +2498,8 @@ bool GVN::performPRE(Function &F) { } Value* predV = findLeader(P, ValNo); - if (predV == 0) { - predMap.push_back(std::make_pair(static_cast<Value *>(0), P)); + if (!predV) { + predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P)); PREPred = P; ++NumWithout; } else if (predV == CurInst) { @@ -2635,9 +2653,8 @@ bool GVN::iterateOnFunction(Function &F) { // std::vector<BasicBlock *> BBVect; BBVect.reserve(256); - for (df_iterator<DomTreeNode*> DI = df_begin(DT->getRootNode()), - DE = df_end(DT->getRootNode()); DI != DE; ++DI) - BBVect.push_back(DI->getBlock()); + for (DomTreeNode *x : depth_first(DT->getRootNode())) + BBVect.push_back(x->getBlock()); for (std::vector<BasicBlock *>::iterator I = BBVect.begin(), E = BBVect.end(); I != E; I++) diff --git a/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp b/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp deleted file mode 100644 index 954e545..0000000 --- a/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp +++ /dev/null @@ -1,310 +0,0 @@ -//===-- GlobalMerge.cpp - Internal globals merging -----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// This pass merges globals with internal linkage into one. This way all the -// globals which were merged into a biggest one can be addressed using offsets -// from the same base pointer (no need for separate base pointer for each of the -// global). Such a transformation can significantly reduce the register pressure -// when many globals are involved. -// -// For example, consider the code which touches several global variables at -// once: -// -// static int foo[N], bar[N], baz[N]; -// -// for (i = 0; i < N; ++i) { -// foo[i] = bar[i] * baz[i]; -// } -// -// On ARM the addresses of 3 arrays should be kept in the registers, thus -// this code has quite large register pressure (loop body): -// -// ldr r1, [r5], #4 -// ldr r2, [r6], #4 -// mul r1, r2, r1 -// str r1, [r0], #4 -// -// Pass converts the code to something like: -// -// static struct { -// int foo[N]; -// int bar[N]; -// int baz[N]; -// } merged; -// -// for (i = 0; i < N; ++i) { -// merged.foo[i] = merged.bar[i] * merged.baz[i]; -// } -// -// and in ARM code this becomes: -// -// ldr r0, [r5, #40] -// ldr r1, [r5, #80] -// mul r0, r1, r0 -// str r0, [r5], #4 -// -// note that we saved 2 registers here almostly "for free". 
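The long header comment above (part of the file being deleted here) already contains the motivating C fragment; for reference, the post-merge shape it describes as one compilable unit, with an assumed N, is below. Every access becomes base + constant offset, so a single register can address all three arrays:

    enum { N = 64 }; // illustrative size, not from the pass

    static struct {
      int foo[N];
      int bar[N];
      int baz[N];
    } merged;

    void kernel() {
      for (int i = 0; i < N; ++i)
        merged.foo[i] = merged.bar[i] * merged.baz[i]; // one base register
    }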
-// ===---------------------------------------------------------------------===// - -#define DEBUG_TYPE "global-merge" -#include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/IR/Attributes.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetLoweringObjectFile.h" -using namespace llvm; - -static cl::opt<bool> -EnableGlobalMergeOnConst("global-merge-on-const", cl::Hidden, - cl::desc("Enable global merge pass on constants"), - cl::init(false)); - -STATISTIC(NumMerged , "Number of globals merged"); -namespace { - class GlobalMerge : public FunctionPass { - const TargetMachine *TM; - - bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals, - Module &M, bool isConst, unsigned AddrSpace) const; - - /// \brief Check if the given variable has been identified as must keep - /// \pre setMustKeepGlobalVariables must have been called on the Module that - /// contains GV - bool isMustKeepGlobalVariable(const GlobalVariable *GV) const { - return MustKeepGlobalVariables.count(GV); - } - - /// Collect every variables marked as "used" or used in a landing pad - /// instruction for this Module. - void setMustKeepGlobalVariables(Module &M); - - /// Collect every variables marked as "used" - void collectUsedGlobalVariables(Module &M); - - /// Keep track of the GlobalVariable that must not be merged away - SmallPtrSet<const GlobalVariable *, 16> MustKeepGlobalVariables; - - public: - static char ID; // Pass identification, replacement for typeid. 
- explicit GlobalMerge(const TargetMachine *TM = 0) - : FunctionPass(ID), TM(TM) { - initializeGlobalMergePass(*PassRegistry::getPassRegistry()); - } - - virtual bool doInitialization(Module &M); - virtual bool runOnFunction(Function &F); - virtual bool doFinalization(Module &M); - - const char *getPassName() const { - return "Merge internal globals"; - } - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); - FunctionPass::getAnalysisUsage(AU); - } - - struct GlobalCmp { - const DataLayout *TD; - - GlobalCmp(const DataLayout *td) : TD(td) { } - - bool operator()(const GlobalVariable *GV1, const GlobalVariable *GV2) { - Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType(); - Type *Ty2 = cast<PointerType>(GV2->getType())->getElementType(); - - return (TD->getTypeAllocSize(Ty1) < TD->getTypeAllocSize(Ty2)); - } - }; - }; -} // end anonymous namespace - -char GlobalMerge::ID = 0; -INITIALIZE_PASS(GlobalMerge, "global-merge", - "Global Merge", false, false) - - -bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, - Module &M, bool isConst, unsigned AddrSpace) const { - const TargetLowering *TLI = TM->getTargetLowering(); - const DataLayout *TD = TLI->getDataLayout(); - - // FIXME: Infer the maximum possible offset depending on the actual users - // (these max offsets are different for the users inside Thumb or ARM - // functions) - unsigned MaxOffset = TLI->getMaximalGlobalOffset(); - - // FIXME: Find better heuristics - std::stable_sort(Globals.begin(), Globals.end(), GlobalCmp(TD)); - - Type *Int32Ty = Type::getInt32Ty(M.getContext()); - - for (size_t i = 0, e = Globals.size(); i != e; ) { - size_t j = 0; - uint64_t MergedSize = 0; - std::vector<Type*> Tys; - std::vector<Constant*> Inits; - for (j = i; j != e; ++j) { - Type *Ty = Globals[j]->getType()->getElementType(); - MergedSize += TD->getTypeAllocSize(Ty); - if (MergedSize > MaxOffset) { - break; - } - Tys.push_back(Ty); - Inits.push_back(Globals[j]->getInitializer()); - } - - StructType *MergedTy = StructType::get(M.getContext(), Tys); - Constant *MergedInit = ConstantStruct::get(MergedTy, Inits); - GlobalVariable *MergedGV = new GlobalVariable(M, MergedTy, isConst, - GlobalValue::InternalLinkage, - MergedInit, "_MergedGlobals", - 0, GlobalVariable::NotThreadLocal, - AddrSpace); - for (size_t k = i; k < j; ++k) { - Constant *Idx[2] = { - ConstantInt::get(Int32Ty, 0), - ConstantInt::get(Int32Ty, k-i) - }; - Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(MergedGV, Idx); - Globals[k]->replaceAllUsesWith(GEP); - Globals[k]->eraseFromParent(); - NumMerged++; - } - i = j; - } - - return true; -} - -void GlobalMerge::collectUsedGlobalVariables(Module &M) { - // Extract global variables from llvm.used array - const GlobalVariable *GV = M.getGlobalVariable("llvm.used"); - if (!GV || !GV->hasInitializer()) return; - - // Should be an array of 'i8*'. 
- const ConstantArray *InitList = cast<ConstantArray>(GV->getInitializer()); - - for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) - if (const GlobalVariable *G = - dyn_cast<GlobalVariable>(InitList->getOperand(i)->stripPointerCasts())) - MustKeepGlobalVariables.insert(G); -} - -void GlobalMerge::setMustKeepGlobalVariables(Module &M) { - collectUsedGlobalVariables(M); - - for (Module::iterator IFn = M.begin(), IEndFn = M.end(); IFn != IEndFn; - ++IFn) { - for (Function::iterator IBB = IFn->begin(), IEndBB = IFn->end(); - IBB != IEndBB; ++IBB) { - // Follow the inwoke link to find the landing pad instruction - const InvokeInst *II = dyn_cast<InvokeInst>(IBB->getTerminator()); - if (!II) continue; - - const LandingPadInst *LPInst = II->getUnwindDest()->getLandingPadInst(); - // Look for globals in the clauses of the landing pad instruction - for (unsigned Idx = 0, NumClauses = LPInst->getNumClauses(); - Idx != NumClauses; ++Idx) - if (const GlobalVariable *GV = - dyn_cast<GlobalVariable>(LPInst->getClause(Idx) - ->stripPointerCasts())) - MustKeepGlobalVariables.insert(GV); - } - } -} - -bool GlobalMerge::doInitialization(Module &M) { - DenseMap<unsigned, SmallVector<GlobalVariable*, 16> > Globals, ConstGlobals, - BSSGlobals; - const TargetLowering *TLI = TM->getTargetLowering(); - const DataLayout *TD = TLI->getDataLayout(); - unsigned MaxOffset = TLI->getMaximalGlobalOffset(); - bool Changed = false; - setMustKeepGlobalVariables(M); - - // Grab all non-const globals. - for (Module::global_iterator I = M.global_begin(), - E = M.global_end(); I != E; ++I) { - // Merge is safe for "normal" internal globals only - if (!I->hasLocalLinkage() || I->isThreadLocal() || I->hasSection()) - continue; - - PointerType *PT = dyn_cast<PointerType>(I->getType()); - assert(PT && "Global variable is not a pointer!"); - - unsigned AddressSpace = PT->getAddressSpace(); - - // Ignore fancy-aligned globals for now. - unsigned Alignment = TD->getPreferredAlignment(I); - Type *Ty = I->getType()->getElementType(); - if (Alignment > TD->getABITypeAlignment(Ty)) - continue; - - // Ignore all 'special' globals. 
- if (I->getName().startswith("llvm.") || - I->getName().startswith(".llvm.")) - continue; - - // Ignore all "required" globals: - if (isMustKeepGlobalVariable(I)) - continue; - - if (TD->getTypeAllocSize(Ty) < MaxOffset) { - if (TargetLoweringObjectFile::getKindForGlobal(I, TLI->getTargetMachine()) - .isBSSLocal()) - BSSGlobals[AddressSpace].push_back(I); - else if (I->isConstant()) - ConstGlobals[AddressSpace].push_back(I); - else - Globals[AddressSpace].push_back(I); - } - } - - for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator - I = Globals.begin(), E = Globals.end(); I != E; ++I) - if (I->second.size() > 1) - Changed |= doMerge(I->second, M, false, I->first); - - for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator - I = BSSGlobals.begin(), E = BSSGlobals.end(); I != E; ++I) - if (I->second.size() > 1) - Changed |= doMerge(I->second, M, false, I->first); - - if (EnableGlobalMergeOnConst) - for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator - I = ConstGlobals.begin(), E = ConstGlobals.end(); I != E; ++I) - if (I->second.size() > 1) - Changed |= doMerge(I->second, M, true, I->first); - - return Changed; -} - -bool GlobalMerge::runOnFunction(Function &F) { - return false; -} - -bool GlobalMerge::doFinalization(Module &M) { - MustKeepGlobalVariables.clear(); - return false; -} - -Pass *llvm::createGlobalMergePass(const TargetMachine *TM) { - return new GlobalMerge(TM); -} diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 235aaaa..e83a5c42 100644 --- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -24,23 +24,22 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "indvars" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Type.h" -#include "llvm/Support/CFG.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -50,6 +49,8 @@ #include "llvm/Transforms/Utils/SimplifyIndVar.h" using namespace llvm; +#define DEBUG_TYPE "indvars" + STATISTIC(NumWidened , "Number of indvars widened"); STATISTIC(NumReplaced , "Number of exit values replaced"); STATISTIC(NumLFTR , "Number of loop exit tests replaced"); @@ -63,12 +64,15 @@ static cl::opt<bool> VerifyIndvars( "verify-indvars", cl::Hidden, cl::desc("Verify the ScalarEvolution result after running indvars")); +static cl::opt<bool> ReduceLiveIVs("liv-reduce", cl::Hidden, + cl::desc("Reduce live induction variables.")); + namespace { class IndVarSimplify : public LoopPass { LoopInfo *LI; ScalarEvolution *SE; DominatorTree *DT; - DataLayout *TD; + const DataLayout *DL; TargetLibraryInfo *TLI; SmallVector<WeakVH, 16> DeadInsts; @@ -76,15 +80,15 @@ namespace { public: static char ID; // Pass identification, replacement for typeid - IndVarSimplify() : LoopPass(ID), LI(0), SE(0), DT(0), TD(0), - Changed(false) { + IndVarSimplify() : 
LoopPass(ID), LI(nullptr), SE(nullptr), DT(nullptr), + DL(nullptr), Changed(false) { initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnLoop(Loop *L, LPPassManager &LPM); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<DominatorTree>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfo>(); AU.addRequired<ScalarEvolution>(); AU.addRequiredID(LoopSimplifyID); @@ -96,7 +100,7 @@ namespace { } private: - virtual void releaseMemory() { + void releaseMemory() override { DeadInsts.clear(); } @@ -119,7 +123,7 @@ namespace { char IndVarSimplify::ID = 0; INITIALIZE_PASS_BEGIN(IndVarSimplify, "indvars", "Induction Variable Simplification", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) @@ -193,7 +197,7 @@ static Instruction *getInsertPointForUses(Instruction *User, Value *Def, if (!PHI) return User; - Instruction *InsertPt = 0; + Instruction *InsertPt = nullptr; for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) { if (PHI->getIncomingValue(i) != Def) continue; @@ -254,34 +258,34 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // an add or increment value can not be represented by an integer. BinaryOperator *Incr = dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge)); - if (Incr == 0 || Incr->getOpcode() != Instruction::FAdd) return; + if (Incr == nullptr || Incr->getOpcode() != Instruction::FAdd) return; // If this is not an add of the PHI with a constantfp, or if the constant fp // is not an integer, bail out. ConstantFP *IncValueVal = dyn_cast<ConstantFP>(Incr->getOperand(1)); int64_t IncValue; - if (IncValueVal == 0 || Incr->getOperand(0) != PN || + if (IncValueVal == nullptr || Incr->getOperand(0) != PN || !ConvertToSInt(IncValueVal->getValueAPF(), IncValue)) return; // Check Incr uses. One user is PN and the other user is an exit condition // used by the conditional terminator. - Value::use_iterator IncrUse = Incr->use_begin(); + Value::user_iterator IncrUse = Incr->user_begin(); Instruction *U1 = cast<Instruction>(*IncrUse++); - if (IncrUse == Incr->use_end()) return; + if (IncrUse == Incr->user_end()) return; Instruction *U2 = cast<Instruction>(*IncrUse++); - if (IncrUse != Incr->use_end()) return; + if (IncrUse != Incr->user_end()) return; // Find exit condition, which is an fcmp. If it doesn't exist, or if it isn't // only used by a branch, we can't transform it. FCmpInst *Compare = dyn_cast<FCmpInst>(U1); if (!Compare) Compare = dyn_cast<FCmpInst>(U2); - if (Compare == 0 || !Compare->hasOneUse() || - !isa<BranchInst>(Compare->use_back())) + if (!Compare || !Compare->hasOneUse() || + !isa<BranchInst>(Compare->user_back())) return; - BranchInst *TheBr = cast<BranchInst>(Compare->use_back()); + BranchInst *TheBr = cast<BranchInst>(Compare->user_back()); // We need to verify that the branch actually controls the iteration count // of the loop. If not, the new IV can overflow and no one will notice. @@ -298,7 +302,7 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // transform it. 
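HandleFloatingPointIV above only fires when the FP increment and exit constants are integers in disguise; the ConvertToSInt helper it calls (defined earlier in this file, outside these hunks) makes that determination on APFloats. A standalone approximation of the same predicate over plain doubles:

    #include <cmath>
    #include <cstdint>

    // Returns true and sets Out when D holds a value exactly representable as
    // an int64_t, mirroring the spirit of IndVarSimplify's ConvertToSInt.
    bool convertToSIntApprox(double D, int64_t &Out) {
      if (!std::isfinite(D) || std::trunc(D) != D)
        return false; // NaN/infinity, or has a fractional part
      if (D < -9223372036854775808.0 || D >= 9223372036854775808.0)
        return false; // outside int64_t's range (bounds are +/- 2^63)
      Out = static_cast<int64_t>(D);
      return true;
    }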
ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1)); int64_t ExitValue; - if (ExitValueVal == 0 || + if (ExitValueVal == nullptr || !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue)) return; @@ -494,6 +498,21 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { unsigned NumPreds = PN->getNumIncomingValues(); + // We would like to be able to RAUW single-incoming value PHI nodes. We + // have to be certain this is safe even when this is an LCSSA PHI node. + // While the computed exit value is no longer varying in *this* loop, the + // exit block may be an exit block for an outer containing loop as well, + // the exit value may be varying in the outer loop, and thus it may still + // require an LCSSA PHI node. The safe case is when this is + // single-predecessor PHI node (LCSSA) and the exit block containing it is + // part of the enclosing loop, or this is the outer most loop of the nest. + // In either case the exit value could (at most) be varying in the same + // loop body as the phi node itself. Thus if it is in turn used outside of + // an enclosing loop it will only be via a separate LCSSA node. + bool LCSSASafePhiForRAUW = + NumPreds == 1 && + (!L->getParentLoop() || L->getParentLoop() == LI->getLoopFor(ExitBB)); + // Iterate over all of the PHI nodes. BasicBlock::iterator BBI = ExitBB->begin(); while ((PN = dyn_cast<PHINode>(BBI++))) { @@ -545,8 +564,8 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { unsigned NumHardInternalUses = 0; unsigned NumSoftExternalUses = 0; unsigned NumUses = 0; - for (Value::use_iterator IB=Inst->use_begin(), IE=Inst->use_end(); - IB!=IE && NumUses<=6 ; ++IB) { + for (auto IB = Inst->user_begin(), IE = Inst->user_end(); + IB != IE && NumUses <= 6; ++IB) { Instruction *UseInstr = cast<Instruction>(*IB); unsigned Opc = UseInstr->getOpcode(); NumUses++; @@ -558,9 +577,9 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { // Do not count the Phi as a use. LCSSA may have inserted // plenty of trivial ones. NumUses--; - for (Value::use_iterator PB=UseInstr->use_begin(), - PE=UseInstr->use_end(); - PB!=PE && NumUses<=6 ; ++PB, ++NumUses) { + for (auto PB = UseInstr->user_begin(), + PE = UseInstr->user_end(); + PB != PE && NumUses <= 6; ++PB, ++NumUses) { unsigned PhiOpc = cast<Instruction>(*PB)->getOpcode(); if (PhiOpc != Instruction::Call && PhiOpc != Instruction::Ret) NumSoftExternalUses++; @@ -594,17 +613,18 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { if (isInstructionTriviallyDead(Inst, TLI)) DeadInsts.push_back(Inst); - if (NumPreds == 1) { - // Completely replace a single-pred PHI. This is safe, because the - // NewVal won't be variant in the loop, so we don't need an LCSSA phi - // node anymore. + // If we determined that this PHI is safe to replace even if an LCSSA + // PHI, do so. + if (LCSSASafePhiForRAUW) { PN->replaceAllUsesWith(ExitVal); PN->eraseFromParent(); } } - if (NumPreds != 1) { - // Clone the PHI and delete the original one. This lets IVUsers and - // any other maps purge the original user from their records. + + // If we were unable to completely replace the PHI node, clone the PHI + // and delete the original one. This lets IVUsers and any other maps + // purge the original user from their records. 
+ if (!LCSSASafePhiForRAUW) { PHINode *NewPN = cast<PHINode>(PN->clone()); NewPN->takeName(PN); NewPN->insertBefore(PN); @@ -632,36 +652,23 @@ namespace { Type *WidestNativeType; // Widest integer type created [sz]ext bool IsSigned; // Was an sext user seen before a zext? - WideIVInfo() : NarrowIV(0), WidestNativeType(0), IsSigned(false) {} - }; - - class WideIVVisitor : public IVVisitor { - ScalarEvolution *SE; - const DataLayout *TD; - - public: - WideIVInfo WI; - - WideIVVisitor(PHINode *NarrowIV, ScalarEvolution *SCEV, - const DataLayout *TData) : - SE(SCEV), TD(TData) { WI.NarrowIV = NarrowIV; } - - // Implement the interface used by simplifyUsersOfIV. - virtual void visitCast(CastInst *Cast); + WideIVInfo() : NarrowIV(nullptr), WidestNativeType(nullptr), + IsSigned(false) {} }; } /// visitCast - Update information about the induction variable that is /// extended by this sign or zero extend operation. This is used to determine /// the final width of the IV before actually widening it. -void WideIVVisitor::visitCast(CastInst *Cast) { +static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE, + const DataLayout *DL) { bool IsSigned = Cast->getOpcode() == Instruction::SExt; if (!IsSigned && Cast->getOpcode() != Instruction::ZExt) return; Type *Ty = Cast->getType(); uint64_t Width = SE->getTypeSizeInBits(Ty); - if (TD && !TD->isLegalInteger(Width)) + if (DL && !DL->isLegalInteger(Width)) return; if (!WI.WidestNativeType) { @@ -688,7 +695,7 @@ struct NarrowIVDefUse { Instruction *NarrowUse; Instruction *WideDef; - NarrowIVDefUse(): NarrowDef(0), NarrowUse(0), WideDef(0) {} + NarrowIVDefUse(): NarrowDef(nullptr), NarrowUse(nullptr), WideDef(nullptr) {} NarrowIVDefUse(Instruction *ND, Instruction *NU, Instruction *WD): NarrowDef(ND), NarrowUse(NU), WideDef(WD) {} @@ -731,9 +738,9 @@ public: L(LI->getLoopFor(OrigPhi->getParent())), SE(SEv), DT(DTree), - WidePhi(0), - WideInc(0), - WideIncExpr(0), + WidePhi(nullptr), + WideInc(nullptr), + WideIncExpr(nullptr), DeadInsts(DI) { assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV"); } @@ -788,7 +795,7 @@ Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) { unsigned Opcode = DU.NarrowUse->getOpcode(); switch (Opcode) { default: - return 0; + return nullptr; case Instruction::Add: case Instruction::Mul: case Instruction::UDiv: @@ -833,14 +840,14 @@ Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) { const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { // Handle the common case of add<nsw/nuw> if (DU.NarrowUse->getOpcode() != Instruction::Add) - return 0; + return nullptr; // One operand (NarrowDef) has already been extended to WideDef. Now determine // if extending the other will lead to a recurrence. unsigned ExtendOperIdx = DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 1 : 0; assert(DU.NarrowUse->getOperand(1-ExtendOperIdx) == DU.NarrowDef && "bad DU"); - const SCEV *ExtendOperExpr = 0; + const SCEV *ExtendOperExpr = nullptr; const OverflowingBinaryOperator *OBO = cast<OverflowingBinaryOperator>(DU.NarrowUse); if (IsSigned && OBO->hasNoSignedWrap()) @@ -850,7 +857,7 @@ const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { ExtendOperExpr = SE->getZeroExtendExpr( SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType); else - return 0; + return nullptr; // When creating this AddExpr, don't apply the current operations NSW or NUW // flags. 
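The reworked visitIVCast above refuses to widen to a width the target has no native integer for, by asking DataLayout::isLegalInteger; the legal widths come from the n8:16:32:64-style component of the datalayout string. A small demo (the layout string is an x86-64-style example assumed for illustration):

    #include "llvm/IR/DataLayout.h"
    #include "llvm/Support/raw_ostream.h"

    void demo() {
      llvm::DataLayout DL("e-m:e-i64:64-f80:128-n8:16:32:64-S128");
      // n8:16:32:64 declares the target's native integer widths:
      llvm::outs() << (DL.isLegalInteger(64) ? "64: legal\n" : "64: not legal\n");
      llvm::outs() << (DL.isLegalInteger(128) ? "128: legal\n" : "128: not legal\n");
    }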
This instruction may be guarded by control flow that the no-wrap @@ -861,7 +868,7 @@ const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { SE->getAddExpr(SE->getSCEV(DU.WideDef), ExtendOperExpr)); if (!AddRec || AddRec->getLoop() != L) - return 0; + return nullptr; return AddRec; } @@ -872,14 +879,14 @@ const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { /// recurrence. Otherwise return NULL. const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { if (!SE->isSCEVable(NarrowUse->getType())) - return 0; + return nullptr; const SCEV *NarrowExpr = SE->getSCEV(NarrowUse); if (SE->getTypeSizeInBits(NarrowExpr->getType()) >= SE->getTypeSizeInBits(WideType)) { // NarrowUse implicitly widens its operand. e.g. a gep with a narrow // index. So don't follow this use. - return 0; + return nullptr; } const SCEV *WideExpr = IsSigned ? @@ -887,19 +894,47 @@ const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { SE->getZeroExtendExpr(NarrowExpr, WideType); const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(WideExpr); if (!AddRec || AddRec->getLoop() != L) - return 0; + return nullptr; return AddRec; } +/// This IV user cannot be widened. Replace this use of the original narrow IV +/// with a truncation of the new wide IV to isolate and eliminate the narrow IV. +static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT) { + DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef + << " for user " << *DU.NarrowUse << "\n"); + IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT)); + Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType()); + DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc); +} + /// WidenIVUse - Determine whether an individual user of the narrow IV can be /// widened. If so, return the wide clone of the user. Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // Stop traversing the def-use chain at inner-loop phis or post-loop phis. - if (isa<PHINode>(DU.NarrowUse) && - LI->getLoopFor(DU.NarrowUse->getParent()) != L) - return 0; - + if (PHINode *UsePhi = dyn_cast<PHINode>(DU.NarrowUse)) { + if (LI->getLoopFor(UsePhi->getParent()) != L) { + // For LCSSA phis, sink the truncate outside the loop. + // After SimplifyCFG most loop exit targets have a single predecessor. + // Otherwise fall back to a truncate within the loop. + if (UsePhi->getNumOperands() != 1) + truncateIVUse(DU, DT); + else { + PHINode *WidePhi = + PHINode::Create(DU.WideDef->getType(), 1, UsePhi->getName() + ".wide", + UsePhi); + WidePhi->addIncoming(DU.WideDef, UsePhi->getIncomingBlock(0)); + IRBuilder<> Builder(WidePhi->getParent()->getFirstInsertionPt()); + Value *Trunc = Builder.CreateTrunc(WidePhi, DU.NarrowDef->getType()); + UsePhi->replaceAllUsesWith(Trunc); + DeadInsts.push_back(UsePhi); + DEBUG(dbgs() << "INDVARS: Widen lcssa phi " << *UsePhi + << " to " << *WidePhi << "\n"); + } + return nullptr; + } + } // Our raison d'etre! Eliminate sign and zero extension. if (IsSigned ? isa<SExtInst>(DU.NarrowUse) : isa<ZExtInst>(DU.NarrowUse)) { Value *NewDef = DU.WideDef; @@ -935,7 +970,7 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // push the uses of WideDef here. // No further widening is needed. The deceased [sz]ext had done it for us. - return 0; + return nullptr; } // Does this user itself evaluate to a recurrence after widening?
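Distilled to its core, the truncateIVUse helper introduced above does the following. This is a sketch under the assumption that the insertion point is simply the non-PHI user itself; the real code computes a safe point with getInsertPointForUses and routes PHI users through the LCSSA path shown in the hunk:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Sketch: rewrite one use of the narrow IV as a truncation of the wide IV.
static void truncateOneUse(Instruction *NarrowUse, Instruction *NarrowDef,
                           Instruction *WideDef) {
  IRBuilder<> B(NarrowUse); // assumes NarrowUse is not a PHI (see above)
  Value *Trunc = B.CreateTrunc(WideDef, NarrowDef->getType());
  // Only this user's operand is rewritten; other uses of NarrowDef remain,
  // which is what eventually isolates the narrow IV for deletion.
  NarrowUse->replaceUsesOfWith(NarrowDef, Trunc);
}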
@@ -947,10 +982,8 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // This user does not evaluate to a recurrence after widening, so don't // follow it. Instead insert a Trunc to kill off the original use, // eventually isolating the original narrow IV so it can be removed. - IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT)); - Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType()); - DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc); - return 0; + truncateIVUse(DU, DT); + return nullptr; } // Assume block terminators cannot evaluate to a recurrence. We can't // insert a Trunc after a terminator if there happens to be a critical edge. @@ -959,14 +992,14 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // Reuse the IV increment that SCEVExpander created as long as it dominates // NarrowUse. - Instruction *WideUse = 0; + Instruction *WideUse = nullptr; if (WideAddRec == WideIncExpr && Rewriter.hoistIVInc(WideInc, DU.NarrowUse)) WideUse = WideInc; else { WideUse = CloneIVUser(DU); if (!WideUse) - return 0; + return nullptr; } // Evaluation of WideAddRec ensured that the narrow expression could be // extended outside the loop without overflow. This suggests that the wide use @@ -977,7 +1010,7 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { DEBUG(dbgs() << "Wide use expression mismatch: " << *WideUse << ": " << *SE->getSCEV(WideUse) << " != " << *WideAddRec << "\n"); DeadInsts.push_back(WideUse); - return 0; + return nullptr; } // Returning WideUse pushes it on the worklist. @@ -987,15 +1020,14 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { /// pushNarrowIVUsers - Add eligible users of NarrowDef to NarrowIVUsers. /// void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) { - for (Value::use_iterator UI = NarrowDef->use_begin(), - UE = NarrowDef->use_end(); UI != UE; ++UI) { - Instruction *NarrowUse = cast<Instruction>(*UI); + for (User *U : NarrowDef->users()) { + Instruction *NarrowUser = cast<Instruction>(U); // Handle data flow merges and bizarre phi cycles. - if (!Widened.insert(NarrowUse)) + if (!Widened.insert(NarrowUser)) continue; - NarrowIVUsers.push_back(NarrowIVDefUse(NarrowDef, NarrowUse, WideDef)); + NarrowIVUsers.push_back(NarrowIVDefUse(NarrowDef, NarrowUser, WideDef)); } } @@ -1013,7 +1045,7 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { // Is this phi an induction variable? const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(OrigPhi)); if (!AddRec) - return NULL; + return nullptr; // Widen the induction variable expression. const SCEV *WideIVExpr = IsSigned ? @@ -1026,7 +1058,7 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { // Can the IV be extended outside the loop without overflow? AddRec = dyn_cast<SCEVAddRecExpr>(WideIVExpr); if (!AddRec || AddRec->getLoop() != L) - return NULL; + return nullptr; // An AddRec must have loop-invariant operands. Since this AddRec is // materialized by a loop header phi, the expression cannot have any post-loop @@ -1080,9 +1112,36 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { } //===----------------------------------------------------------------------===// +// Live IV Reduction - Minimize IVs live across the loop.
+//===----------------------------------------------------------------------===// + + +//===----------------------------------------------------------------------===// // Simplification of IV users based on SCEV evaluation. //===----------------------------------------------------------------------===// +namespace { + class IndVarSimplifyVisitor : public IVVisitor { + ScalarEvolution *SE; + const DataLayout *DL; + PHINode *IVPhi; + + public: + WideIVInfo WI; + + IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV, + const DataLayout *DL, const DominatorTree *DTree): + SE(SCEV), DL(DL), IVPhi(IV) { + DT = DTree; + WI.NarrowIV = IVPhi; + if (ReduceLiveIVs) + setSplitOverflowIntrinsics(); + } + + // Implement the interface used by simplifyUsersOfIV. + void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, DL); } + }; +} /// SimplifyAndExtend - Iteratively perform simplification on a worklist of IV /// users. Each successive simplification may push more users which may @@ -1114,12 +1173,12 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L, PHINode *CurrIV = LoopPhis.pop_back_val(); // Information about sign/zero extensions of CurrIV. - WideIVVisitor WIV(CurrIV, SE, TD); + IndVarSimplifyVisitor Visitor(CurrIV, SE, DL, DT); - Changed |= simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &WIV); + Changed |= simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &Visitor); - if (WIV.WI.WidestNativeType) { - WideIVs.push_back(WIV.WI); + if (Visitor.WI.WidestNativeType) { + WideIVs.push_back(Visitor.WI); } } while(!LoopPhis.empty()); @@ -1225,7 +1284,7 @@ static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE) { static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { Instruction *IncI = dyn_cast<Instruction>(IncV); if (!IncI) - return 0; + return nullptr; switch (IncI->getOpcode()) { case Instruction::Add: @@ -1236,17 +1295,17 @@ static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { if (IncI->getNumOperands() == 2) break; default: - return 0; + return nullptr; } PHINode *Phi = dyn_cast<PHINode>(IncI->getOperand(0)); if (Phi && Phi->getParent() == L->getHeader()) { if (isLoopInvariant(IncI->getOperand(1), L, DT)) return Phi; - return 0; + return nullptr; } if (IncI->getOpcode() == Instruction::GetElementPtr) - return 0; + return nullptr; // Allow add/sub to be commuted. Phi = dyn_cast<PHINode>(IncI->getOperand(1)); @@ -1254,7 +1313,7 @@ static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { if (isLoopInvariant(IncI->getOperand(0), L, DT)) return Phi; } - return 0; + return nullptr; } /// Return the compare guarding the loop latch, or NULL for unrecognized tests. @@ -1264,7 +1323,7 @@ static ICmpInst *getLoopTest(Loop *L) { BasicBlock *LatchBlock = L->getLoopLatch(); // Don't bother with LFTR if the loop is not properly simplified. 
if (!LatchBlock) - return 0; + return nullptr; BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator()); assert(BI && "expected exit branch"); @@ -1359,15 +1418,11 @@ static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) { int LatchIdx = Phi->getBasicBlockIndex(LatchBlock); Value *IncV = Phi->getIncomingValue(LatchIdx); - for (Value::use_iterator UI = Phi->use_begin(), UE = Phi->use_end(); - UI != UE; ++UI) { - if (*UI != Cond && *UI != IncV) return false; - } + for (User *U : Phi->users()) + if (U != Cond && U != IncV) return false; - for (Value::use_iterator UI = IncV->use_begin(), UE = IncV->use_end(); - UI != UE; ++UI) { - if (*UI != Cond && *UI != Phi) return false; - } + for (User *U : IncV->users()) + if (U != Cond && U != Phi) return false; return true; } @@ -1386,15 +1441,15 @@ static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) { /// could at least handle constant BECounts. static PHINode * FindLoopCounter(Loop *L, const SCEV *BECount, - ScalarEvolution *SE, DominatorTree *DT, const DataLayout *TD) { + ScalarEvolution *SE, DominatorTree *DT, const DataLayout *DL) { uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType()); Value *Cond = cast<BranchInst>(L->getExitingBlock()->getTerminator())->getCondition(); // Loop over all of the PHI nodes, looking for a simple counter. - PHINode *BestPhi = 0; - const SCEV *BestInit = 0; + PHINode *BestPhi = nullptr; + const SCEV *BestInit = nullptr; BasicBlock *LatchBlock = L->getLoopLatch(); assert(LatchBlock && "needsLFTR should guarantee a loop latch"); @@ -1415,7 +1470,7 @@ FindLoopCounter(Loop *L, const SCEV *BECount, // AR may be wider than BECount. With eq/ne tests overflow is immaterial. // AR may not be a narrower type, or we may never exit. uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType()); - if (PhiWidth < BCWidth || (TD && !TD->isLegalInteger(PhiWidth))) + if (PhiWidth < BCWidth || (DL && !DL->isLegalInteger(PhiWidth))) continue; const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)); @@ -1518,7 +1573,7 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, // IVInit integer and IVCount pointer would only occur if a canonical IV // were generated on top of case #2, which is not expected. - const SCEV *IVLimit = 0; + const SCEV *IVLimit = nullptr; // For unit stride, IVCount = Start + BECount with 2's complement overflow. // For non-zero Start, compute IVCount here. if (AR->getStart()->isZero()) @@ -1697,13 +1752,12 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) { // Determine if there is a use in or before the loop (direct or // otherwise). 
bool UsedInLoop = false; - for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); - UI != UE; ++UI) { - User *U = *UI; - BasicBlock *UseBB = cast<Instruction>(U)->getParent(); - if (PHINode *P = dyn_cast<PHINode>(U)) { + for (Use &U : I->uses()) { + Instruction *User = cast<Instruction>(U.getUser()); + BasicBlock *UseBB = User->getParent(); + if (PHINode *P = dyn_cast<PHINode>(User)) { unsigned i = - PHINode::getIncomingValueNumForOperand(UI.getOperandNo()); + PHINode::getIncomingValueNumForOperand(U.getOperandNo()); UseBB = P->getIncomingBlock(i); } if (UseBB == Preheader || L->contains(UseBB)) { @@ -1743,6 +1797,9 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) { //===----------------------------------------------------------------------===// bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { + if (skipOptnoneFunction(L)) + return false; + // If LoopSimplify form is not available, stay out of trouble. Some notes: // - LSR currently only supports LoopSimplify-form loops. Indvars' // canonicalization can be a pessimization without LSR to "clean up" @@ -1756,8 +1813,9 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { LI = &getAnalysis<LoopInfo>(); SE = &getAnalysis<ScalarEvolution>(); - DT = &getAnalysis<DominatorTree>(); - TD = getAnalysisIfAvailable<DataLayout>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); DeadInsts.clear(); @@ -1799,13 +1857,13 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // If we have a trip count expression, rewrite the loop's exit condition // using it. We can currently only handle loops with a single exit. if (canExpandBackedgeTakenCount(L, SE) && needsLFTR(L, DT)) { - PHINode *IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT, TD); + PHINode *IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT, DL); if (IndVar) { // Check preconditions for proper SCEVExpander operation. SCEV does not // express SCEVExpander's dependencies, such as LoopSimplify. Instead any // pass that uses the SCEVExpander must do it. This does not work well for - // loop passes because SCEVExpander makes assumptions about all loops, while - // LoopPassManager only forces the current loop to be simplified. + // loop passes because SCEVExpander makes assumptions about all loops, + // while LoopPassManager only forces the current loop to be simplified. // // FIXME: SCEV expansion has no way to bail out, so the caller must // explicitly check any assumptions made by SCEV. Brittle. 
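The SinkUnusedInvariants hunk above switches from dereferencing a use_iterator to iterating I->uses() directly, which makes the operand slot (the Use) available without iterator gymnastics. A sketch of the recurring "where does this use live?" computation; the helper name is invented:

#include "llvm/IR/Instructions.h"
using namespace llvm;

// Sketch: the block a use "occurs in". For a PHI operand that is the
// incoming block, not the block containing the PHI itself.
static BasicBlock *blockOfUse(Use &U) {
  Instruction *UserInst = cast<Instruction>(U.getUser());
  if (PHINode *P = dyn_cast<PHINode>(UserInst)) {
    unsigned i = PHINode::getIncomingValueNumForOperand(U.getOperandNo());
    return P->getIncomingBlock(i);
  }
  return UserInst->getParent();
}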
diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp index b3ec2fc..21f80385 100644 --- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "jump-threading" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" @@ -27,10 +26,10 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -38,6 +37,8 @@ #include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; +#define DEBUG_TYPE "jump-threading" + STATISTIC(NumThreads, "Number of jumps threaded"); STATISTIC(NumFolds, "Number of terminators folded"); STATISTIC(NumDupes, "Number of branch blocks duplicated to eliminate phi"); @@ -76,7 +77,7 @@ namespace { /// revectored to the false side of the second if. /// class JumpThreading : public FunctionPass { - DataLayout *TD; + const DataLayout *DL; TargetLibraryInfo *TLI; LazyValueInfo *LVI; #ifdef NDEBUG @@ -105,9 +106,9 @@ namespace { initializeJumpThreadingPass(*PassRegistry::getPassRegistry()); } - bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LazyValueInfo>(); AU.addPreserved<LazyValueInfo>(); AU.addRequired<TargetLibraryInfo>(); @@ -148,11 +149,24 @@ FunctionPass *llvm::createJumpThreadingPass() { return new JumpThreading(); } /// runOnFunction - Top level algorithm. /// bool JumpThreading::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; + DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n"); - TD = getAnalysisIfAvailable<DataLayout>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; TLI = &getAnalysis<TargetLibraryInfo>(); LVI = &getAnalysis<LazyValueInfo>(); + // Remove unreachable blocks from the function, as they may result in an + // infinite loop. We only thread when we find something profitable, and + // threading one branch can create further opportunities. If these + // opportunities form a cycle, i.e. if any jump threading is undoing previous + // threading in the path, then we will loop forever. We take care of this + // issue by not jump threading across back edges. This works for normal + // cases, but not for unreachable blocks, as they may contain a cycle with no + // back edge. + removeUnreachableBlocks(F); + FindLoopHeaders(F); bool Changed, EverChanged = false; @@ -251,7 +265,7 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB, // as having cost of 2 total, and if they are a vector intrinsic, we model // them as having cost 1. if (const CallInst *CI = dyn_cast<CallInst>(I)) { - if (CI->hasFnAttr(Attribute::NoDuplicate)) + if (CI->cannotDuplicate()) // Blocks with NoDuplicate are modelled as having infinite cost, so they // are never duplicated.
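The switch from hasFnAttr(Attribute::NoDuplicate) to cannotDuplicate() is cosmetic; the accessor wraps the same attribute query, and the return ~0U the diff resumes with below is the infinite-cost result that comment describes. A sketch of the check in isolation (illustrative helper, not from the patch):

#include "llvm/IR/Instructions.h"
using namespace llvm;

// Sketch: a call marked 'noduplicate' makes its block unthreadable, because
// threading would require duplicating the block, and therefore the call.
static bool blockMustNotBeDuplicated(const BasicBlock &BB) {
  for (const Instruction &I : BB)
    if (const CallInst *CI = dyn_cast<CallInst>(&I))
      if (CI->cannotDuplicate()) // same as hasFnAttr(Attribute::NoDuplicate)
        return true;
  return false;
}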
return ~0U; @@ -304,7 +318,7 @@ void JumpThreading::FindLoopHeaders(Function &F) { /// Returns null if Val is null or not an appropriate constant. static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) { if (!Val) - return 0; + return nullptr; // Undef is "known" enough. if (UndefValue *U = dyn_cast<UndefValue>(Val)) @@ -348,7 +362,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, // If V is a non-instruction value, or an instruction in a different block, // then it can't be derived from a PHI. Instruction *I = dyn_cast<Instruction>(V); - if (I == 0 || I->getParent() != BB) { + if (!I || I->getParent() != BB) { // Okay, if this is a live-in value, see if it has a known value at the end // of any of our predecessors. @@ -490,8 +504,8 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, Value *LHS = PN->getIncomingValue(i); Value *RHS = Cmp->getOperand(1)->DoPHITranslation(BB, PredBB); - Value *Res = SimplifyCmpInst(Cmp->getPredicate(), LHS, RHS, TD); - if (Res == 0) { + Value *Res = SimplifyCmpInst(Cmp->getPredicate(), LHS, RHS, DL); + if (!Res) { if (!isa<Constant>(RHS)) continue; @@ -577,7 +591,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, // Either operand will do, so be sure to pick the one that's a known // constant. // FIXME: Do this more cleverly if both values are known constants? - KnownCond = (TrueVal != 0); + KnownCond = (TrueVal != nullptr); } // See if the select has a known constant value for this predecessor. @@ -655,14 +669,9 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { if (LoopHeaders.erase(SinglePred)) LoopHeaders.insert(BB); - // Remember if SinglePred was the entry block of the function. If so, we - // will need to move BB back to the entry position. - bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock(); LVI->eraseBlock(SinglePred); MergeBasicBlockIntoOnlyPred(BB); - if (isEntry && BB != &BB->getParent()->getEntryBlock()) - BB->moveBefore(&BB->getParent()->getEntryBlock()); return true; } } @@ -692,7 +701,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { // Run constant folding to see if we can reduce the condition to a simple // constant. if (Instruction *I = dyn_cast<Instruction>(Condition)) { - Value *SimpleVal = ConstantFoldInstruction(I, TD, TLI); + Value *SimpleVal = ConstantFoldInstruction(I, DL, TLI); if (SimpleVal) { I->replaceAllUsesWith(SimpleVal); I->eraseFromParent(); @@ -733,7 +742,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { Instruction *CondInst = dyn_cast<Instruction>(Condition); // All the rest of our checks depend on the condition being an instruction. - if (CondInst == 0) { + if (!CondInst) { // FIXME: Unify this with code below. if (ProcessThreadableEdges(Condition, BB, Preference)) return true; @@ -886,7 +895,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { SmallPtrSet<BasicBlock*, 8> PredsScanned; typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy; AvailablePredsTy AvailablePreds; - BasicBlock *OneUnavailablePred = 0; + BasicBlock *OneUnavailablePred = nullptr; // If we got here, the loaded value is transparent through to the start of the // block. Check to see if it is available in any of the predecessor blocks. @@ -900,16 +909,16 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Scan the predecessor to see if the value is available in the pred. 
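That scan is a direct call to FindAvailableLoadedValue, picking up at the BBIt = PredBB->end() line the diff continues with below. Distilled into a self-contained sketch; the limit of six instructions matches the code above, and the TBAA tag of any matching access comes back through the out-parameter:

#include "llvm/Analysis/Loads.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

// Sketch: walk backwards from the end of PredBB, at most 6 instructions,
// looking for a load or store that already produces the value of *LoadedPtr.
static Value *availableInPred(Value *LoadedPtr, BasicBlock *PredBB,
                              MDNode *&ThisTBAATag) {
  BasicBlock::iterator BBIt = PredBB->end();
  return FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt,
                                  /*MaxInstsToScan=*/6, /*AA=*/nullptr,
                                  &ThisTBAATag);
}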
BBIt = PredBB->end(); - MDNode *ThisTBAATag = 0; + MDNode *ThisTBAATag = nullptr; Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, 6, - 0, &ThisTBAATag); + nullptr, &ThisTBAATag); if (!PredAvailable) { OneUnavailablePred = PredBB; continue; } // If tbaa tags disagree or are not present, forget about them. - if (TBAATag != ThisTBAATag) TBAATag = 0; + if (TBAATag != ThisTBAATag) TBAATag = nullptr; // If so, this load is partially redundant. Remember this info so that we // can create a PHI node. @@ -925,7 +934,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // predecessor, we want to insert a merge block for those common predecessors. // This ensures that we only have to insert one reload, thus not increasing // code size. - BasicBlock *UnavailablePred = 0; + BasicBlock *UnavailablePred = nullptr; // If there is exactly one predecessor where the value is unavailable, the // already computed 'OneUnavailablePred' block is it. If it ends in an @@ -992,7 +1001,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { BasicBlock *P = *PI; AvailablePredsTy::iterator I = std::lower_bound(AvailablePreds.begin(), AvailablePreds.end(), - std::make_pair(P, (Value*)0)); + std::make_pair(P, (Value*)nullptr)); assert(I != AvailablePreds.end() && I->first == P && "Didn't find entry for predecessor!"); @@ -1099,7 +1108,7 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, SmallPtrSet<BasicBlock*, 16> SeenPreds; SmallVector<std::pair<BasicBlock*, BasicBlock*>, 16> PredToDestList; - BasicBlock *OnlyDest = 0; + BasicBlock *OnlyDest = nullptr; BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL; for (unsigned i = 0, e = PredValues.size(); i != e; ++i) { @@ -1116,7 +1125,7 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, BasicBlock *DestBB; if (isa<UndefValue>(Val)) - DestBB = 0; + DestBB = nullptr; else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero()); else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) { @@ -1167,7 +1176,7 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, // If the threadable edges are branching on an undefined value, we get to pick // the destination that these predecessors should get to. - if (MostPopularDest == 0) + if (!MostPopularDest) MostPopularDest = BB->getTerminator()-> getSuccessor(GetBestDestForJumpOnUndef(BB)); @@ -1269,7 +1278,7 @@ bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) { } // Determine which value to split on, true, false, or undef if neither. - ConstantInt *SplitVal = 0; + ConstantInt *SplitVal = nullptr; if (NumTrue > NumFalse) SplitVal = ConstantInt::getTrue(BB->getContext()); else if (NumTrue != 0 || NumFalse != 0) @@ -1290,7 +1299,7 @@ bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) { // help us. However, we can just replace the LHS or RHS with the constant. if (BlocksToFoldInto.size() == cast<PHINode>(BB->front()).getNumIncomingValues()) { - if (SplitVal == 0) { + if (!SplitVal) { // If all preds provide undef, just nuke the xor, because it is undef too. BO->replaceAllUsesWith(UndefValue::get(BO->getType())); BO->eraseFromParent(); @@ -1431,16 +1440,15 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) { // Scan all uses of this instruction to see if it is used outside of its // block, and if so, record them in UsesToRename. 
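The use-collection loop the diff resumes with below implements, per use, the predicate sketched here. Note the PHINode::getIncomingBlock(Use &) overload, the sibling of the index-based lookup used in the IndVarSimplify hunks earlier; the helper name is invented:

#include "llvm/IR/Instructions.h"
using namespace llvm;

// Sketch: does this use of a value defined in BB escape BB? A PHI use is
// charged to the incoming block its operand flows in from.
static bool useEscapesBlock(Use &U, BasicBlock *BB) {
  Instruction *UserInst = cast<Instruction>(U.getUser());
  if (PHINode *UserPN = dyn_cast<PHINode>(UserInst))
    return UserPN->getIncomingBlock(U) != BB;
  return UserInst->getParent() != BB;
}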
- for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E; - ++UI) { - Instruction *User = cast<Instruction>(*UI); + for (Use &U : I->uses()) { + Instruction *User = cast<Instruction>(U.getUser()); if (PHINode *UserPN = dyn_cast<PHINode>(User)) { - if (UserPN->getIncomingBlock(UI) == BB) + if (UserPN->getIncomingBlock(U) == BB) continue; } else if (User->getParent() == BB) continue; - UsesToRename.push_back(&UI.getUse()); + UsesToRename.push_back(&U); } // If there are no uses outside the block, we're done with this instruction. @@ -1475,7 +1483,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, // At this point, the IR is fully up to date and consistent. Do a quick scan // over the new instructions and zap any that are constants or dead. This // frequently happens because of phi translation. - SimplifyInstructionsInBlock(NewBB, TD, TLI); + SimplifyInstructionsInBlock(NewBB, DL, TLI); // Threaded an edge! ++NumThreads; @@ -1528,7 +1536,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, // can just clone the bits from BB into the end of the new PredBB. BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator()); - if (OldPredBranch == 0 || !OldPredBranch->isUnconditional()) { + if (!OldPredBranch || !OldPredBranch->isUnconditional()) { PredBB = SplitEdge(PredBB, BB, this); OldPredBranch = cast<BranchInst>(PredBB->getTerminator()); } @@ -1557,7 +1565,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, // If this instruction can be simplified after the operands are updated, // just use the simplified value instead. This frequently happens due to // phi translation. - if (Value *IV = SimplifyInstruction(New, TD)) { + if (Value *IV = SimplifyInstruction(New, DL)) { delete New; ValueMapping[BI] = IV; } else { @@ -1585,16 +1593,15 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) { // Scan all uses of this instruction to see if it is used outside of its // block, and if so, record them in UsesToRename. - for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E; - ++UI) { - Instruction *User = cast<Instruction>(*UI); + for (Use &U : I->uses()) { + Instruction *User = cast<Instruction>(U.getUser()); if (PHINode *UserPN = dyn_cast<PHINode>(User)) { - if (UserPN->getIncomingBlock(UI) == BB) + if (UserPN->getIncomingBlock(U) == BB) continue; } else if (User->getParent() == BB) continue; - UsesToRename.push_back(&UI.getUse()); + UsesToRename.push_back(&U); } // If there are no uses outside the block, we're done with this instruction. 
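Once UsesToRename is populated and the block has been cloned, the rewriting itself (unchanged by this patch, hence absent from the hunks) follows the standard SSAUpdater recipe. As a self-contained sketch:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;

// Sketch: rewrite all escaped uses of I, given its clone NewI in NewBB.
static void rewriteEscapedUses(Instruction *I, Instruction *NewI,
                               BasicBlock *BB, BasicBlock *NewBB,
                               SmallVectorImpl<Use *> &UsesToRename) {
  SSAUpdater SSA;
  SSA.Initialize(I->getType(), I->getName());
  SSA.AddAvailableValue(BB, I);       // the original definition
  SSA.AddAvailableValue(NewBB, NewI); // the threaded copy
  while (!UsesToRename.empty())
    SSA.RewriteUse(*UsesToRename.pop_back_val());
}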
diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp index f94cd2a..abcceb2 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp @@ -30,33 +30,37 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "licm" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" -#include "llvm/Support/CFG.h" +#include "llvm/IR/PredIteratorCache.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include <algorithm> using namespace llvm; +#define DEBUG_TYPE "licm" + STATISTIC(NumSunk , "Number of instructions sunk out of loop"); STATISTIC(NumHoisted , "Number of instructions hoisted out of loop"); STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk"); @@ -74,26 +78,28 @@ namespace { initializeLICMPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnLoop(Loop *L, LPPassManager &LPM); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; /// This transformation requires natural loop information & requires that /// loop preheaders be inserted into the CFG... /// - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<DominatorTree>(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfo>(); AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreservedID(LCSSAID); AU.addRequired<AliasAnalysis>(); AU.addPreserved<AliasAnalysis>(); - AU.addPreserved("scalar-evolution"); - AU.addPreservedID(LoopSimplifyID); + AU.addPreserved<ScalarEvolution>(); AU.addRequired<TargetLibraryInfo>(); } using llvm::Pass::doFinalization; - bool doFinalization() { + bool doFinalization() override { assert(LoopToAliasSetMap.empty() && "Didn't free loop alias sets"); return false; } @@ -103,7 +109,7 @@ namespace { LoopInfo *LI; // Current LoopInfo DominatorTree *DT; // Dominator Tree for the current Loop. - DataLayout *TD; // DataLayout for constant folding. + const DataLayout *DL; // DataLayout for constant folding. TargetLibraryInfo *TLI; // TargetLibraryInfo for constant folding. // State that is updated as we process loops. @@ -117,11 +123,12 @@ namespace { DenseMap<Loop*, AliasSetTracker*> LoopToAliasSetMap; /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info. - void cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L); + void cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, + Loop *L) override; /// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias /// set. 
- void deleteAnalysisValue(Value *V, Loop *L); + void deleteAnalysisValue(Value *V, Loop *L) override; /// SinkRegion - Walk the specified region of the CFG (defined by all blocks /// dominated by the specified block, and that are in the current loop) in @@ -183,15 +190,26 @@ namespace { void PromoteAliasSet(AliasSet &AS, SmallVectorImpl<BasicBlock*> &ExitBlocks, - SmallVectorImpl<Instruction*> &InsertPts); + SmallVectorImpl<Instruction*> &InsertPts, + PredIteratorCache &PIC); + + /// \brief Create a copy of the instruction in the exit block and patch up + /// SSA. + /// PN is a user of I in ExitBlock that can be used to get the number and + /// list of predecessors fast. + Instruction *CloneInstructionInExitBlock(Instruction &I, + BasicBlock &ExitBlock, + PHINode &PN); }; } char LICM::ID = 0; INITIALIZE_PASS_BEGIN(LICM, "licm", "Loop Invariant Code Motion", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(LICM, "licm", "Loop Invariant Code Motion", false, false) @@ -203,16 +221,22 @@ Pass *llvm::createLICMPass() { return new LICM(); } /// times on one loop. /// bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { + if (skipOptnoneFunction(L)) + return false; + Changed = false; // Get our Loop and Alias Analysis information... LI = &getAnalysis<LoopInfo>(); AA = &getAnalysis<AliasAnalysis>(); - DT = &getAnalysis<DominatorTree>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - TD = getAnalysisIfAvailable<DataLayout>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; TLI = &getAnalysis<TargetLibraryInfo>(); + assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form."); + CurAST = new AliasSetTracker(*AA); // Collect Alias info from subloops. for (Loop::iterator LoopItr = L->begin(), LoopItrE = L->end(); @@ -272,19 +296,36 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // Now that all loop invariants have been removed from the loop, promote any // memory references to scalars that we can. - if (!DisablePromotion && Preheader && L->hasDedicatedExits()) { + if (!DisablePromotion && (Preheader || L->hasDedicatedExits())) { SmallVector<BasicBlock *, 8> ExitBlocks; SmallVector<Instruction *, 8> InsertPts; + PredIteratorCache PIC; // Loop over all of the alias sets in the tracker object. for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end(); I != E; ++I) - PromoteAliasSet(*I, ExitBlocks, InsertPts); + PromoteAliasSet(*I, ExitBlocks, InsertPts, PIC); + + // Once we have promoted values across the loop body we have to recursively + // reform LCSSA as any nested loop may now have values defined within the + // loop used in the outer loop. + // FIXME: This is really heavy handed. It would be a bit better to use an + // SSAUpdater strategy during promotion that was LCSSA aware and reformed + // it as it went. + if (Changed) + formLCSSARecursively(*L, *DT, getAnalysisIfAvailable<ScalarEvolution>()); } + // Check that neither this loop nor its parent have had LCSSA broken. LICM is + // specifically moving instructions across the loop boundary and so it is + // especially in need of sanity checking here. 
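The asserts the diff continues with below enforce that invariant. The recovery step itself is a single call; a minimal sketch using the analyses LICM already holds (ScalarEvolution may be null):

#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;

// Sketch: after promotion, a value defined in an inner loop may be used in
// an outer loop without an intervening PHI; rebuild LCSSA bottom-up.
static bool restoreLCSSA(Loop &L, DominatorTree &DT, ScalarEvolution *SE) {
  // Returns true if any PHI was inserted. SE is only consulted to forget
  // cached trip counts for loops that change, and may be null.
  return formLCSSARecursively(L, DT, SE);
}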
+ assert(L->isLCSSAForm(*DT) && "Loop not left in LCSSA form after LICM!"); + assert((!L->getParentLoop() || L->getParentLoop()->isLCSSAForm(*DT)) && + "Parent loop not left in LCSSA form after LICM!"); + // Clear out loop state information for the next iteration - CurLoop = 0; - Preheader = 0; + CurLoop = nullptr; + Preheader = nullptr; // If this loop is nested inside of another one, save the alias information // for when we process the outer loop. @@ -302,7 +343,7 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { /// iteration. /// void LICM::SinkRegion(DomTreeNode *N) { - assert(N != 0 && "Null dominator tree node?"); + assert(N != nullptr && "Null dominator tree node?"); BasicBlock *BB = N->getBlock(); // If this subregion is not in the top level loop at all, exit. @@ -349,7 +390,7 @@ void LICM::SinkRegion(DomTreeNode *N) { /// before uses, allowing us to hoist a loop body in one pass without iteration. /// void LICM::HoistRegion(DomTreeNode *N) { - assert(N != 0 && "Null dominator tree node?"); + assert(N != nullptr && "Null dominator tree node?"); BasicBlock *BB = N->getBlock(); // If this subregion is not in the top level loop at all, exit. @@ -364,7 +405,7 @@ void LICM::HoistRegion(DomTreeNode *N) { // Try constant folding this instruction. If all the operands are // constants, it is technically hoistable, but it would be better to just // fold it. - if (Constant *C = ConstantFoldInstruction(&I, TD, TLI)) { + if (Constant *C = ConstantFoldInstruction(&I, DL, TLI)) { DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n'); CurAST->copyValue(&I, C); CurAST->deleteValue(&I); @@ -450,26 +491,82 @@ bool LICM::canSinkOrHoistInst(Instruction &I) { return isSafeToExecuteUnconditionally(I); } +/// \brief Returns true if a PHINode is trivially replaceable with an +/// Instruction. +/// +/// This is true when all incoming values are that instruction. This pattern +/// occurs most often with LCSSA PHI nodes. +static bool isTriviallyReplacablePHI(PHINode &PN, Instruction &I) { + for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) + if (PN.getIncomingValue(i) != &I) + return false; + + return true; +} + /// isNotUsedInLoop - Return true if the only users of this instruction are /// outside of the loop. If this is true, we can sink the instruction to the /// exit blocks of the loop. /// bool LICM::isNotUsedInLoop(Instruction &I) { - for (Value::use_iterator UI = I.use_begin(), E = I.use_end(); UI != E; ++UI) { - Instruction *User = cast<Instruction>(*UI); - if (PHINode *PN = dyn_cast<PHINode>(User)) { - // PHI node uses occur in predecessor blocks! + for (User *U : I.users()) { + Instruction *UI = cast<Instruction>(U); + if (PHINode *PN = dyn_cast<PHINode>(UI)) { + // A PHI node where all of the incoming values are this instruction is + // special -- it can just be RAUW'ed with the instruction and thus + // doesn't require a use in the predecessor. This is a particularly + // important special case because it is the pattern found in LCSSA form. + if (isTriviallyReplacablePHI(*PN, I)) { + if (CurLoop->contains(PN)) + return false; + else + continue; + } + + // Otherwise, PHI node uses occur in predecessor blocks of the incoming + // values. Check for such a use being inside the loop.
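A compact restatement of that fast path, applied to the pattern LCSSA produces; the incoming-block scan the diff resumes with below covers the PHIs this sketch rejects. Helper name invented:

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Sketch of the new special case:
//   loop.exit:                                  ; outside the loop
//     %v.lcssa = phi i32 [ %v, %latch1 ], [ %v, %latch2 ]
// Every incoming value is %v, so %v.lcssa can be RAUW'ed with a sunk clone
// of %v. The same PHI *inside* the loop would still pin %v there.
static bool isTriviallyReplaceableOutsideLoop(PHINode *PN, Instruction &I,
                                              Loop *CurLoop) {
  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
    if (PN->getIncomingValue(i) != &I)
      return false; // mixed inputs: the incoming-block scan decides
  return !CurLoop->contains(PN);
}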
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) if (PN->getIncomingValue(i) == &I) if (CurLoop->contains(PN->getIncomingBlock(i))) return false; - } else if (CurLoop->contains(User)) { - return false; + + continue; } + + if (CurLoop->contains(UI)) + return false; } return true; } +Instruction *LICM::CloneInstructionInExitBlock(Instruction &I, + BasicBlock &ExitBlock, + PHINode &PN) { + Instruction *New = I.clone(); + ExitBlock.getInstList().insert(ExitBlock.getFirstInsertionPt(), New); + if (!I.getName().empty()) New->setName(I.getName() + ".le"); + + // Build LCSSA PHI nodes for any in-loop operands. Note that this is + // particularly cheap because we can rip off the PHI node that we're + // replacing for the number and blocks of the predecessors. + // OPT: If this shows up in a profile, we can instead finish sinking all + // invariant instructions, and then walk their operands to re-establish + // LCSSA. That will eliminate creating PHI nodes just to nuke them when + // sinking bottom-up. + for (User::op_iterator OI = New->op_begin(), OE = New->op_end(); OI != OE; + ++OI) + if (Instruction *OInst = dyn_cast<Instruction>(*OI)) + if (Loop *OLoop = LI->getLoopFor(OInst->getParent())) + if (!OLoop->contains(&PN)) { + PHINode *OpPN = + PHINode::Create(OInst->getType(), PN.getNumIncomingValues(), + OInst->getName() + ".lcssa", ExitBlock.begin()); + for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) + OpPN->addIncoming(OInst, PN.getIncomingBlock(i)); + *OI = OpPN; + } + return New; +} /// sink - When an instruction is found to only be used outside of the loop, /// this function moves it to the exit blocks and patches up SSA form as needed. @@ -479,119 +576,45 @@ bool LICM::isNotUsedInLoop(Instruction &I) { void LICM::sink(Instruction &I) { DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n"); - SmallVector<BasicBlock*, 8> ExitBlocks; - CurLoop->getUniqueExitBlocks(ExitBlocks); - if (isa<LoadInst>(I)) ++NumMovedLoads; else if (isa<CallInst>(I)) ++NumMovedCalls; ++NumSunk; Changed = true; - // The case where there is only a single exit node of this loop is common - // enough that we handle it as a special (more efficient) case. It is more - // efficient to handle because there are no PHI nodes that need to be placed. - if (ExitBlocks.size() == 1) { - if (!DT->dominates(I.getParent(), ExitBlocks[0])) { - // Instruction is not used, just delete it. - CurAST->deleteValue(&I); - // If I has users in unreachable blocks, eliminate. - // If I is not void type then replaceAllUsesWith undef. - // This allows ValueHandlers and custom metadata to adjust itself. - if (!I.use_empty()) - I.replaceAllUsesWith(UndefValue::get(I.getType())); - I.eraseFromParent(); - } else { - // Move the instruction to the start of the exit block, after any PHI - // nodes in it. - I.moveBefore(ExitBlocks[0]->getFirstInsertionPt()); - - // This instruction is no longer in the AST for the current loop, because - // we just sunk it out of the loop. If we just sunk it into an outer - // loop, we will rediscover the operation when we process it. - CurAST->deleteValue(&I); - } - return; - } - - if (ExitBlocks.empty()) { - // The instruction is actually dead if there ARE NO exit blocks. - CurAST->deleteValue(&I); - // If I has users in unreachable blocks, eliminate. - // If I is not void type then replaceAllUsesWith undef. - // This allows ValueHandlers and custom metadata to adjust itself. 
- if (!I.use_empty()) - I.replaceAllUsesWith(UndefValue::get(I.getType())); - I.eraseFromParent(); - return; - } - - // Otherwise, if we have multiple exits, use the SSAUpdater to do all of the - // hard work of inserting PHI nodes as necessary. - SmallVector<PHINode*, 8> NewPHIs; - SSAUpdater SSA(&NewPHIs); - - if (!I.use_empty()) - SSA.Initialize(I.getType(), I.getName()); - - // Insert a copy of the instruction in each exit block of the loop that is - // dominated by the instruction. Each exit block is known to only be in the - // ExitBlocks list once. - BasicBlock *InstOrigBB = I.getParent(); - unsigned NumInserted = 0; +#ifndef NDEBUG + SmallVector<BasicBlock *, 32> ExitBlocks; + CurLoop->getUniqueExitBlocks(ExitBlocks); + SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end()); +#endif - for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { - BasicBlock *ExitBlock = ExitBlocks[i]; + // Clones of this instruction. Don't create more than one per exit block! + SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies; - if (!DT->dominates(InstOrigBB, ExitBlock)) - continue; + // If this instruction is only used outside of the loop, then all users are + // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of + // the instruction. + while (!I.use_empty()) { + // The user must be a PHI node. + PHINode *PN = cast<PHINode>(I.user_back()); - // Insert the code after the last PHI node. - BasicBlock::iterator InsertPt = ExitBlock->getFirstInsertionPt(); + BasicBlock *ExitBlock = PN->getParent(); + assert(ExitBlockSet.count(ExitBlock) && + "The LCSSA PHI is not in an exit block!"); - // If this is the first exit block processed, just move the original - // instruction, otherwise clone the original instruction and insert - // the copy. Instruction *New; - if (NumInserted++ == 0) { - I.moveBefore(InsertPt); - New = &I; - } else { - New = I.clone(); - if (!I.getName().empty()) - New->setName(I.getName()+".le"); - ExitBlock->getInstList().insert(InsertPt, New); - } - - // Now that we have inserted the instruction, inform SSAUpdater. - if (!I.use_empty()) - SSA.AddAvailableValue(ExitBlock, New); - } - - // If the instruction doesn't dominate any exit blocks, it must be dead. - if (NumInserted == 0) { - CurAST->deleteValue(&I); - if (!I.use_empty()) - I.replaceAllUsesWith(UndefValue::get(I.getType())); - I.eraseFromParent(); - return; - } - - // Next, rewrite uses of the instruction, inserting PHI nodes as needed. - for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; ) { - // Grab the use before incrementing the iterator. - Use &U = UI.getUse(); - // Increment the iterator before removing the use from the list. - ++UI; - SSA.RewriteUseAfterInsertions(U); + auto It = SunkCopies.find(ExitBlock); + if (It != SunkCopies.end()) + New = It->second; + else + New = SunkCopies[ExitBlock] = + CloneInstructionInExitBlock(I, *ExitBlock, *PN); + + PN->replaceAllUsesWith(New); + PN->eraseFromParent(); } - // Update CurAST for NewPHIs if I had pointer type. - if (I.getType()->isPointerTy()) - for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) - CurAST->copyValue(&I, NewPHIs[i]); - - // Finally, remove the instruction from CurAST. It is no longer in the loop. 
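The rewrite above relies on LCSSA having already funneled every loop-external use through an exit-block PHI, which is why one clone per exit block, memoized in a small map, is enough. The shape, as a sketch; CloneInstructionInExitBlock is the patch's member function, re-declared here as a free function purely for illustration:

#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Stand-in for the patch's LICM::CloneInstructionInExitBlock (not defined here).
Instruction *CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBB,
                                         PHINode &PN);

// Sketch: replace each LCSSA user-PHI with a clone of I, sharing one clone
// per exit block so N PHIs in the same block get a single sunk copy.
static void sinkViaLCSSAPhis(
    Instruction &I,
    SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies) {
  while (!I.use_empty()) {
    PHINode *PN = cast<PHINode>(I.user_back()); // LCSSA guarantees a PHI
    BasicBlock *ExitBB = PN->getParent();
    Instruction *&Copy = SunkCopies[ExitBB];
    if (!Copy)
      Copy = CloneInstructionInExitBlock(I, *ExitBB, *PN);
    PN->replaceAllUsesWith(Copy);
    PN->eraseFromParent();
  }
}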
CurAST->deleteValue(&I); + I.eraseFromParent(); } /// hoist - When an instruction is found to only use loop invariant operands @@ -616,7 +639,7 @@ void LICM::hoist(Instruction &I) { /// bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) { // If it is not a trapping instruction, it is always safe to hoist. - if (isSafeToSpeculativelyExecute(&Inst)) + if (isSafeToSpeculativelyExecute(&Inst, DL)) return true; return isGuaranteedToExecute(Inst); @@ -662,24 +685,42 @@ namespace { SmallPtrSet<Value*, 4> &PointerMustAliases; SmallVectorImpl<BasicBlock*> &LoopExitBlocks; SmallVectorImpl<Instruction*> &LoopInsertPts; + PredIteratorCache &PredCache; AliasSetTracker &AST; + LoopInfo &LI; DebugLoc DL; int Alignment; MDNode *TBAATag; + + Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const { + if (Instruction *I = dyn_cast<Instruction>(V)) + if (Loop *L = LI.getLoopFor(I->getParent())) + if (!L->contains(BB)) { + // We need to create an LCSSA PHI node for the incoming value and + // store that. + PHINode *PN = PHINode::Create( + I->getType(), PredCache.GetNumPreds(BB), + I->getName() + ".lcssa", BB->begin()); + for (BasicBlock **PI = PredCache.GetPreds(BB); *PI; ++PI) + PN->addIncoming(I, *PI); + return PN; + } + return V; + } + public: - LoopPromoter(Value *SP, - const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, - SmallPtrSet<Value*, 4> &PMA, - SmallVectorImpl<BasicBlock*> &LEB, - SmallVectorImpl<Instruction*> &LIP, - AliasSetTracker &ast, DebugLoc dl, int alignment, + LoopPromoter(Value *SP, const SmallVectorImpl<Instruction *> &Insts, + SSAUpdater &S, SmallPtrSet<Value *, 4> &PMA, + SmallVectorImpl<BasicBlock *> &LEB, + SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC, + AliasSetTracker &ast, LoopInfo &li, DebugLoc dl, int alignment, MDNode *TBAATag) - : LoadAndStorePromoter(Insts, S), SomePtr(SP), - PointerMustAliases(PMA), LoopExitBlocks(LEB), LoopInsertPts(LIP), - AST(ast), DL(dl), Alignment(alignment), TBAATag(TBAATag) {} + : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA), + LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast), + LI(li), DL(dl), Alignment(alignment), TBAATag(TBAATag) {} - virtual bool isInstInList(Instruction *I, - const SmallVectorImpl<Instruction*> &) const { + bool isInstInList(Instruction *I, + const SmallVectorImpl<Instruction*> &) const override { Value *Ptr; if (LoadInst *LI = dyn_cast<LoadInst>(I)) Ptr = LI->getOperand(0); @@ -688,7 +729,7 @@ namespace { return PointerMustAliases.count(Ptr); } - virtual void doExtraRewritesBeforeFinalDeletion() const { + void doExtraRewritesBeforeFinalDeletion() const override { // Insert stores after in the loop exit blocks. Each exit block gets a // store of the live-out values that feed them. 
Since we've already told // the SSA updater about the defs in the loop and the preheader @@ -696,19 +737,21 @@ namespace { for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) { BasicBlock *ExitBlock = LoopExitBlocks[i]; Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock); + LiveInValue = maybeInsertLCSSAPHI(LiveInValue, ExitBlock); + Value *Ptr = maybeInsertLCSSAPHI(SomePtr, ExitBlock); Instruction *InsertPos = LoopInsertPts[i]; - StoreInst *NewSI = new StoreInst(LiveInValue, SomePtr, InsertPos); + StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos); NewSI->setAlignment(Alignment); NewSI->setDebugLoc(DL); if (TBAATag) NewSI->setMetadata(LLVMContext::MD_tbaa, TBAATag); } } - virtual void replaceLoadWithValue(LoadInst *LI, Value *V) const { + void replaceLoadWithValue(LoadInst *LI, Value *V) const override { // Update alias analysis. AST.copyValue(LI, V); } - virtual void instructionDeleted(Instruction *I) const { + void instructionDeleted(Instruction *I) const override { AST.deleteValue(I); } }; @@ -721,7 +764,8 @@ namespace { /// void LICM::PromoteAliasSet(AliasSet &AS, SmallVectorImpl<BasicBlock*> &ExitBlocks, - SmallVectorImpl<Instruction*> &InsertPts) { + SmallVectorImpl<Instruction*> &InsertPts, + PredIteratorCache &PIC) { // We can promote this alias set if it has a store, if it is a "Must" alias // set, if the pointer is loop invariant, and if we are not eliminating any // volatile loads or stores. @@ -754,7 +798,7 @@ void LICM::PromoteAliasSet(AliasSet &AS, // We start with an alignment of one and try to find instructions that allow // us to prove better alignment. unsigned Alignment = 1; - MDNode *TBAATag = 0; + MDNode *TBAATag = nullptr; // Check that all of the pointers in the alias set have the same type. We // cannot (yet) promote a memory location that is loaded and stored in @@ -769,23 +813,22 @@ void LICM::PromoteAliasSet(AliasSet &AS, if (SomePtr->getType() != ASIV->getType()) return; - for (Value::use_iterator UI = ASIV->use_begin(), UE = ASIV->use_end(); - UI != UE; ++UI) { + for (User *U : ASIV->users()) { // Ignore instructions that are outside the loop. - Instruction *Use = dyn_cast<Instruction>(*UI); - if (!Use || !CurLoop->contains(Use)) + Instruction *UI = dyn_cast<Instruction>(U); + if (!UI || !CurLoop->contains(UI)) continue; // If there is an non-load/store instruction in the loop, we can't promote // it. - if (LoadInst *load = dyn_cast<LoadInst>(Use)) { + if (LoadInst *load = dyn_cast<LoadInst>(UI)) { assert(!load->isVolatile() && "AST broken"); if (!load->isSimple()) return; - } else if (StoreInst *store = dyn_cast<StoreInst>(Use)) { + } else if (StoreInst *store = dyn_cast<StoreInst>(UI)) { // Stores *of* the pointer are not interesting, only stores *to* the // pointer. - if (Use->getOperand(1) != ASIV) + if (UI->getOperand(1) != ASIV) continue; assert(!store->isVolatile() && "AST broken"); if (!store->isSimple()) @@ -801,13 +844,13 @@ void LICM::PromoteAliasSet(AliasSet &AS, // Larger is better, with the exception of 0 being the best alignment. unsigned InstAlignment = store->getAlignment(); if ((InstAlignment > Alignment || InstAlignment == 0) && Alignment != 0) - if (isGuaranteedToExecute(*Use)) { + if (isGuaranteedToExecute(*UI)) { GuaranteedToExecute = true; Alignment = InstAlignment; } if (!GuaranteedToExecute) - GuaranteedToExecute = isGuaranteedToExecute(*Use); + GuaranteedToExecute = isGuaranteedToExecute(*UI); } else return; // Not a load or store. 
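In summary, the scan above admits only simple (non-volatile, non-atomic) loads from the pointer and simple stores to it; anything else in the alias set blocks promotion. A sketch of the per-user test, with an invented name:

#include "llvm/IR/Instructions.h"
using namespace llvm;

// Sketch: does this in-loop user of the alias set's pointer block promotion?
static bool blocksPromotion(Instruction *UI, Value *ASIV) {
  if (LoadInst *L = dyn_cast<LoadInst>(UI))
    return !L->isSimple(); // volatile/atomic loads block promotion
  if (StoreInst *S = dyn_cast<StoreInst>(UI)) {
    if (S->getPointerOperand() != ASIV)
      return false;        // a store *of* the pointer is not interesting
    return !S->isSimple(); // a store *to* it must be simple
  }
  return true;             // any other user (e.g. a call) blocks promotion
}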
@@ -815,13 +858,13 @@ void LICM::PromoteAliasSet(AliasSet &AS, // Merge the TBAA tags. if (LoopUses.empty()) { // On the first load/store, just take its TBAA tag. - TBAATag = Use->getMetadata(LLVMContext::MD_tbaa); + TBAATag = UI->getMetadata(LLVMContext::MD_tbaa); } else if (TBAATag) { TBAATag = MDNode::getMostGenericTBAA(TBAATag, - Use->getMetadata(LLVMContext::MD_tbaa)); + UI->getMetadata(LLVMContext::MD_tbaa)); } - - LoopUses.push_back(Use); + + LoopUses.push_back(UI); } } @@ -853,7 +896,7 @@ void LICM::PromoteAliasSet(AliasSet &AS, SmallVector<PHINode*, 16> NewPHIs; SSAUpdater SSA(&NewPHIs); LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks, - InsertPts, *CurAST, DL, Alignment, TBAATag); + InsertPts, PIC, *CurAST, *LI, DL, Alignment, TBAATag); // Set up the preheader to have a definition of the value. It is the live-out // value from the preheader that uses in the loop will use. diff --git a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp new file mode 100644 index 0000000..846aa70 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp @@ -0,0 +1,268 @@ +//===- LoadCombine.cpp - Combine Adjacent Loads ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This transformation combines adjacent loads. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar.h" + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/TargetFolder.h" +#include "llvm/Pass.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "load-combine" + +STATISTIC(NumLoadsAnalyzed, "Number of loads analyzed for combining"); +STATISTIC(NumLoadsCombined, "Number of loads combined"); + +namespace { +struct PointerOffsetPair { + Value *Pointer; + uint64_t Offset; +}; + +struct LoadPOPPair { + LoadPOPPair(LoadInst *L, PointerOffsetPair P, unsigned O) + : Load(L), POP(P), InsertOrder(O) {} + LoadPOPPair() {} + LoadInst *Load; + PointerOffsetPair POP; + /// \brief The new load needs to be created before the first load in IR order. 
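Before the declaration continues below with the InsertOrder member, it may help to see what the new pass aims for at the source level. An illustrative example, not taken from the patch, assuming a little-endian target:

// Input: two adjacent i32 loads from offsets 0 and 4 off the same pointer.
struct Pair { unsigned a, b; };
unsigned sum(const Pair *P) {
  return P->a + P->b;
}
// After load-combine (conceptually): a single i64 load of the whole Pair,
// with P->a taken as the low 32 bits and P->b as the high 32 bits. These
// are the "combine.extract" values produced by CreateExtractInteger in the
// implementation further down.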
+ unsigned InsertOrder; +}; + +class LoadCombine : public BasicBlockPass { + LLVMContext *C; + const DataLayout *DL; + +public: + LoadCombine() + : BasicBlockPass(ID), + C(nullptr), DL(nullptr) { + initializeSROAPass(*PassRegistry::getPassRegistry()); + } + bool doInitialization(Function &) override; + bool runOnBasicBlock(BasicBlock &BB) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + + const char *getPassName() const override { return "LoadCombine"; } + static char ID; + + typedef IRBuilder<true, TargetFolder> BuilderTy; + +private: + BuilderTy *Builder; + + PointerOffsetPair getPointerOffsetPair(LoadInst &); + bool combineLoads(DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> &); + bool aggregateLoads(SmallVectorImpl<LoadPOPPair> &); + bool combineLoads(SmallVectorImpl<LoadPOPPair> &); +}; +} + +bool LoadCombine::doInitialization(Function &F) { + DEBUG(dbgs() << "LoadCombine function: " << F.getName() << "\n"); + C = &F.getContext(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + if (!DLP) { + DEBUG(dbgs() << " Skipping LoadCombine -- no target data!\n"); + return false; + } + DL = &DLP->getDataLayout(); + return true; +} + +PointerOffsetPair LoadCombine::getPointerOffsetPair(LoadInst &LI) { + PointerOffsetPair POP; + POP.Pointer = LI.getPointerOperand(); + POP.Offset = 0; + while (isa<BitCastInst>(POP.Pointer) || isa<GetElementPtrInst>(POP.Pointer)) { + if (auto *GEP = dyn_cast<GetElementPtrInst>(POP.Pointer)) { + unsigned BitWidth = DL->getPointerTypeSizeInBits(GEP->getType()); + APInt Offset(BitWidth, 0); + if (GEP->accumulateConstantOffset(*DL, Offset)) + POP.Offset += Offset.getZExtValue(); + else + // Can't handle GEPs with variable indices. + return POP; + POP.Pointer = GEP->getPointerOperand(); + } else if (auto *BC = dyn_cast<BitCastInst>(POP.Pointer)) + POP.Pointer = BC->getOperand(0); + } + return POP; +} + +bool LoadCombine::combineLoads( + DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> &LoadMap) { + bool Combined = false; + for (auto &Loads : LoadMap) { + if (Loads.second.size() < 2) + continue; + std::sort(Loads.second.begin(), Loads.second.end(), + [](const LoadPOPPair &A, const LoadPOPPair &B) { + return A.POP.Offset < B.POP.Offset; + }); + if (aggregateLoads(Loads.second)) + Combined = true; + } + return Combined; +} + +/// \brief Try to aggregate loads from a sorted list of loads to be combined. +/// +/// It is guaranteed that no writes occur between any of the loads. All loads +/// have the same base pointer. There are at least two loads. +bool LoadCombine::aggregateLoads(SmallVectorImpl<LoadPOPPair> &Loads) { + assert(Loads.size() >= 2 && "Insufficient loads!"); + LoadInst *BaseLoad = nullptr; + SmallVector<LoadPOPPair, 8> AggregateLoads; + bool Combined = false; + uint64_t PrevOffset = -1ull; + uint64_t PrevSize = 0; + for (auto &L : Loads) { + if (PrevOffset == -1ull) { + BaseLoad = L.Load; + PrevOffset = L.POP.Offset; + PrevSize = DL->getTypeStoreSize(L.Load->getType()); + AggregateLoads.push_back(L); + continue; + } + if (L.Load->getAlignment() > BaseLoad->getAlignment()) + continue; + if (L.POP.Offset > PrevOffset + PrevSize) { + // No other load will be combinable + if (combineLoads(AggregateLoads)) + Combined = true; + AggregateLoads.clear(); + PrevOffset = -1; + continue; + } + if (L.POP.Offset != PrevOffset + PrevSize) + // This load is offset less than the size of the last load. + // FIXME: We may want to handle this case. 
+ continue; + PrevOffset = L.POP.Offset; + PrevSize = DL->getTypeStoreSize(L.Load->getType()); + AggregateLoads.push_back(L); + } + if (combineLoads(AggregateLoads)) + Combined = true; + return Combined; +} + +/// \brief Given a list of combinable loads, combine the maximum number of them. +bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) { + // Remove loads from the end while the size is not a power of 2. + unsigned TotalSize = 0; + for (const auto &L : Loads) + TotalSize += L.Load->getType()->getPrimitiveSizeInBits(); + while (TotalSize != 0 && !isPowerOf2_32(TotalSize)) + TotalSize -= Loads.pop_back_val().Load->getType()->getPrimitiveSizeInBits(); + if (Loads.size() < 2) + return false; + + DEBUG({ + dbgs() << "***** Combining Loads ******\n"; + for (const auto &L : Loads) { + dbgs() << L.POP.Offset << ": " << *L.Load << "\n"; + } + }); + + // Find first load. This is where we put the new load. + LoadPOPPair FirstLP; + FirstLP.InsertOrder = -1u; + for (const auto &L : Loads) + if (L.InsertOrder < FirstLP.InsertOrder) + FirstLP = L; + + unsigned AddressSpace = + FirstLP.POP.Pointer->getType()->getPointerAddressSpace(); + + Builder->SetInsertPoint(FirstLP.Load); + Value *Ptr = Builder->CreateConstGEP1_64( + Builder->CreatePointerCast(Loads[0].POP.Pointer, + Builder->getInt8PtrTy(AddressSpace)), + Loads[0].POP.Offset); + LoadInst *NewLoad = new LoadInst( + Builder->CreatePointerCast( + Ptr, PointerType::get(IntegerType::get(Ptr->getContext(), TotalSize), + Ptr->getType()->getPointerAddressSpace())), + Twine(Loads[0].Load->getName()) + ".combined", false, + Loads[0].Load->getAlignment(), FirstLP.Load); + + for (const auto &L : Loads) { + Builder->SetInsertPoint(L.Load); + Value *V = Builder->CreateExtractInteger( + *DL, NewLoad, cast<IntegerType>(L.Load->getType()), + L.POP.Offset - Loads[0].POP.Offset, "combine.extract"); + L.Load->replaceAllUsesWith(V); + } + + NumLoadsCombined = NumLoadsCombined + Loads.size(); + return true; +} + +bool LoadCombine::runOnBasicBlock(BasicBlock &BB) { + if (skipOptnoneFunction(BB) || !DL) + return false; + + IRBuilder<true, TargetFolder> + TheBuilder(BB.getContext(), TargetFolder(DL)); + Builder = &TheBuilder; + + DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> LoadMap; + + bool Combined = false; + unsigned Index = 0; + for (auto &I : BB) { + if (I.mayWriteToMemory() || I.mayThrow()) { + if (combineLoads(LoadMap)) + Combined = true; + LoadMap.clear(); + continue; + } + LoadInst *LI = dyn_cast<LoadInst>(&I); + if (!LI) + continue; + ++NumLoadsAnalyzed; + if (!LI->isSimple() || !LI->getType()->isIntegerTy()) + continue; + auto POP = getPointerOffsetPair(*LI); + if (!POP.Pointer) + continue; + LoadMap[POP.Pointer].push_back(LoadPOPPair(LI, POP, Index++)); + } + if (combineLoads(LoadMap)) + Combined = true; + return Combined; +} + +void LoadCombine::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); +} + +char LoadCombine::ID = 0; + +BasicBlockPass *llvm::createLoadCombinePass() { + return new LoadCombine(); +} + +INITIALIZE_PASS(LoadCombine, "load-combine", "Combine Adjacent Loads", false, + false) diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index 9e39d2e..5ab686a 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -14,15 +14,16 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-delete" #include
"llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/IR/Dominators.h" using namespace llvm; +#define DEBUG_TYPE "loop-delete" + STATISTIC(NumDeleted, "Number of loops deleted"); namespace { @@ -34,17 +35,17 @@ namespace { } // Possibly eliminate loop L if it is dead. - bool runOnLoop(Loop *L, LPPassManager &LPM); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<DominatorTree>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfo>(); AU.addRequired<ScalarEvolution>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addPreserved<ScalarEvolution>(); - AU.addPreserved<DominatorTree>(); + AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<LoopInfo>(); AU.addPreservedID(LoopSimplifyID); AU.addPreservedID(LCSSAID); @@ -61,7 +62,7 @@ namespace { char LoopDeletion::ID = 0; INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion", "Delete dead loops", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) @@ -130,6 +131,9 @@ bool LoopDeletion::isLoopDead(Loop *L, /// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA /// in order to make various safety checks work. bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { + if (skipOptnoneFunction(L)) + return false; + // We can only remove the loop if there is a preheader that we can // branch from after removing it. BasicBlock *preheader = L->getLoopPreheader(); @@ -202,7 +206,7 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { // Update the dominator tree and remove the instructions and blocks that will // be deleted from the reference counting scheme. 
- DominatorTree &DT = getAnalysis<DominatorTree>(); + DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); SmallVector<DomTreeNode*, 8> ChildNodes; for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); LI != LE; ++LI) { diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 952b76b..a12f5a7 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -41,7 +41,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-idiom" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -51,6 +50,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" @@ -60,6 +60,8 @@ #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "loop-idiom" + STATISTIC(NumMemSet, "Number of memset's formed from loop stores"); STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); @@ -78,9 +80,6 @@ namespace { return dyn_cast<BranchInst>(BB->getTerminator()); } - /// Return the condition of the branch terminating the given basic block. - static Value *getBrCondtion(BasicBlock *); - /// Derive the precondition block (i.e. the block that guards the loop /// preheader) from the given preheader. static BasicBlock *getPrecondBb(BasicBlock *PreHead); @@ -108,22 +107,22 @@ namespace { bool preliminaryScreen(); /// Check if the given conditional branch is based on the comparison - /// beween a variable and zero, and if the variable is non-zero, the - /// control yeilds to the loop entry. If the branch matches the behavior, + /// between a variable and zero, and if the variable is non-zero, the + /// control yields to the loop entry. If the branch matches the behavior, /// the variable involved in the comparison is returned. This function will /// be called to see if the precondition and postcondition of the loop /// are in desirable form. - Value *matchCondition (BranchInst *Br, BasicBlock *NonZeroTarget) const; + Value *matchCondition(BranchInst *Br, BasicBlock *NonZeroTarget) const; /// Return true iff the idiom is detected in the loop, and 1) \p CntInst - /// is set to the instruction counting the pupulation bit. 2) \p CntPhi + /// is set to the instruction counting the population bit. 2) \p CntPhi /// is set to the corresponding phi node. 3) \p Var is set to the value /// whose population bits are being counted. bool detectIdiom(Instruction *&CntInst, PHINode *&CntPhi, Value *&Var) const; /// Insert ctpop intrinsic function and some obviously dead instructions. - void transform (Instruction *CntInst, PHINode *CntPhi, Value *Var); + void transform(Instruction *CntInst, PHINode *CntPhi, Value *Var); /// Create llvm.ctpop.* intrinsic function.
CallInst *createPopcntIntrinsic(IRBuilderTy &IRB, Value *Val, DebugLoc DL); @@ -131,7 +130,7 @@ namespace { class LoopIdiomRecognize : public LoopPass { Loop *CurLoop; - const DataLayout *TD; + const DataLayout *DL; DominatorTree *DT; ScalarEvolution *SE; TargetLibraryInfo *TLI; @@ -140,10 +139,10 @@ namespace { static char ID; explicit LoopIdiomRecognize() : LoopPass(ID) { initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); - TD = 0; DT = 0; SE = 0; TLI = 0; TTI = 0; + DL = nullptr; DT = nullptr; SE = nullptr; TLI = nullptr; TTI = nullptr; } - bool runOnLoop(Loop *L, LPPassManager &LPM); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, SmallVectorImpl<BasicBlock*> &ExitBlocks); @@ -163,7 +162,7 @@ namespace { /// This transformation requires natural loop information & requires that /// loop preheaders be inserted into the CFG. /// - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LoopInfo>(); AU.addPreserved<LoopInfo>(); AU.addRequiredID(LoopSimplifyID); @@ -174,18 +173,23 @@ namespace { AU.addPreserved<AliasAnalysis>(); AU.addRequired<ScalarEvolution>(); AU.addPreserved<ScalarEvolution>(); - AU.addPreserved<DominatorTree>(); - AU.addRequired<DominatorTree>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetLibraryInfo>(); AU.addRequired<TargetTransformInfo>(); } const DataLayout *getDataLayout() { - return TD ? TD : TD=getAnalysisIfAvailable<DataLayout>(); + if (DL) + return DL; + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; + return DL; } DominatorTree *getDominatorTree() { - return DT ? DT : (DT=&getAnalysis<DominatorTree>()); + return DT ? DT + : (DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree()); } ScalarEvolution *getScalarEvolution() { @@ -212,7 +216,7 @@ char LoopIdiomRecognize::ID = 0; INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfo) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) @@ -244,7 +248,7 @@ static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE, for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) { Value *Op = DeadInst->getOperand(op); - DeadInst->setOperand(op, 0); + DeadInst->setOperand(op, nullptr); // If this operand just became dead, add it to the NowDeadInsts list. if (!Op->use_empty()) continue; @@ -286,17 +290,12 @@ bool LIRUtil::isAlmostEmpty(BasicBlock *BB) { return false; } -Value *LIRUtil::getBrCondtion(BasicBlock *BB) { - BranchInst *Br = getBranch(BB); - return Br ? Br->getCondition() : 0; -} - BasicBlock *LIRUtil::getPrecondBb(BasicBlock *PreHead) { if (BasicBlock *BB = PreHead->getSinglePredecessor()) { BranchInst *Br = getBranch(BB); - return Br && Br->isConditional() ? BB : 0; + return Br && Br->isConditional() ? 
BB : nullptr; } - return 0; + return nullptr; } //===----------------------------------------------------------------------===// @@ -306,7 +305,7 @@ BasicBlock *LIRUtil::getPrecondBb(BasicBlock *PreHead) { //===----------------------------------------------------------------------===// NclPopcountRecognize::NclPopcountRecognize(LoopIdiomRecognize &TheLIR): - LIR(TheLIR), CurLoop(TheLIR.getLoop()), PreCondBB(0) { + LIR(TheLIR), CurLoop(TheLIR.getLoop()), PreCondBB(nullptr) { } bool NclPopcountRecognize::preliminaryScreen() { @@ -343,25 +342,25 @@ bool NclPopcountRecognize::preliminaryScreen() { return true; } -Value *NclPopcountRecognize::matchCondition (BranchInst *Br, - BasicBlock *LoopEntry) const { +Value *NclPopcountRecognize::matchCondition(BranchInst *Br, + BasicBlock *LoopEntry) const { if (!Br || !Br->isConditional()) - return 0; + return nullptr; ICmpInst *Cond = dyn_cast<ICmpInst>(Br->getCondition()); if (!Cond) - return 0; + return nullptr; ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1)); if (!CmpZero || !CmpZero->isZero()) - return 0; + return nullptr; ICmpInst::Predicate Pred = Cond->getPredicate(); if ((Pred == ICmpInst::ICMP_NE && Br->getSuccessor(0) == LoopEntry) || (Pred == ICmpInst::ICMP_EQ && Br->getSuccessor(1) == LoopEntry)) return Cond->getOperand(0); - return 0; + return nullptr; } bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst, @@ -392,9 +391,9 @@ bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst, Value *VarX1, *VarX0; PHINode *PhiX, *CountPhi; - DefX2 = CountInst = 0; - VarX1 = VarX0 = 0; - PhiX = CountPhi = 0; + DefX2 = CountInst = nullptr; + VarX1 = VarX0 = nullptr; + PhiX = CountPhi = nullptr; LoopEntry = *(CurLoop->block_begin()); // step 1: Check if the loop-back branch is in desirable form. @@ -441,7 +440,7 @@ bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst, // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1 { - CountInst = NULL; + CountInst = nullptr; for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI(), IterE = LoopEntry->end(); Iter != IterE; Iter++) { Instruction *Inst = Iter; @@ -458,9 +457,8 @@ bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst, // Check if the result of the instruction is live out of the loop. bool LiveOutLoop = false; - for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end(); - I != E; I++) { - if ((cast<Instruction>(*I))->getParent() != LoopEntry) { + for (User *U : Inst->users()) { + if ((cast<Instruction>(U))->getParent() != LoopEntry) { LiveOutLoop = true; break; } } @@ -519,7 +517,7 @@ void NclPopcountRecognize::transform(Instruction *CntInst, // TripCnt is exactly the number of iterations the loop has TripCnt = NewCount; - // If the popoulation counter's initial value is not zero, insert Add Inst. + // If the population counter's initial value is not zero, insert Add Inst. Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead); ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal); if (!InitConst || !InitConst->isZero()) { @@ -596,11 +594,9 @@ void NclPopcountRecognize::transform(Instruction *CntInst, // __builtin_ctpop().
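For readers unfamiliar with the idiom: the loop shape detectIdiom matches (steps 1 through 4 in the comments above) is Kernighan's bit count. An illustrative source-level sketch, with names keyed to the step comments; nothing below is code from the patch:

// x &= x - 1 clears the lowest set bit, so the loop trips once per set bit.
int popcount_idiom(unsigned x) {   // Var: the value whose bits are counted
  int cnt = 0;                     // CntPhi's initial value
  while (x != 0) {                 // loop-back test against zero
    x = x & (x - 1);               // DefX2: x.next = x1 & (x1 - 1)
    ++cnt;                         // CntInst: cnt2 = cnt1 + 1
  }
  return cnt;                      // live-out use, rewired by transform()
}
// transform() computes NewCount = __builtin_popcount(x) (llvm.ctpop.*) up
// front, replaces the out-of-body uses of cnt with it, and leaves the loop
// dead for later passes to remove.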
{ SmallVector<Value *, 4> CntUses; - for (Value::use_iterator I = CntInst->use_begin(), E = CntInst->use_end(); - I != E; I++) { - if (cast<Instruction>(*I)->getParent() != Body) - CntUses.push_back(*I); - } + for (User *U : CntInst->users()) + if (cast<Instruction>(U)->getParent() != Body) + CntUses.push_back(U); for (unsigned Idx = 0; Idx < CntUses.size(); Idx++) { (cast<Instruction>(CntUses[Idx]))->replaceUsesOfWith(CntInst, NewCount); } @@ -705,6 +701,9 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() { } bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { + if (skipOptnoneFunction(L)) + return false; + CurLoop = L; // If the loop could not be converted to canonical form, it must have an @@ -746,7 +745,7 @@ bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, // If processing the store invalidated our iterator, start over from the // top of the block. - if (InstPtr == 0) + if (!InstPtr) I = BB->begin(); continue; } @@ -759,7 +758,7 @@ bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, // If processing the memset invalidated our iterator, start over from the // top of the block. - if (InstPtr == 0) + if (!InstPtr) I = BB->begin(); continue; } @@ -777,7 +776,7 @@ bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { Value *StorePtr = SI->getPointerOperand(); // Reject stores that are so large that they overflow an unsigned. - uint64_t SizeInBits = TD->getTypeSizeInBits(StoredVal->getType()); + uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType()); if ((SizeInBits & 7) || (SizeInBits >> 32) != 0) return false; @@ -786,7 +785,7 @@ bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { // random store we can't handle. const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); - if (StoreEv == 0 || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) + if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) return false; // Check to see if the stride matches the size of the store. If so, then we @@ -794,7 +793,7 @@ bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { unsigned StoreSize = (unsigned)SizeInBits >> 3; const SCEVConstant *Stride = dyn_cast<SCEVConstant>(StoreEv->getOperand(1)); - if (Stride == 0 || StoreSize != Stride->getValue()->getValue()) { + if (!Stride || StoreSize != Stride->getValue()->getValue()) { // TODO: Could also handle negative stride here someday, that will require // the validity check in mayLoopAccessLocation to be updated though. // Enable this to print exact negative strides. @@ -843,7 +842,7 @@ processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) { // loop, which indicates a strided store. If we have something else, it's a // random store we can't handle. const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer)); - if (Ev == 0 || Ev->getLoop() != CurLoop || !Ev->isAffine()) + if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine()) return false; // Reject memsets that are so large that they overflow an unsigned. @@ -857,7 +856,7 @@ processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) { // TODO: Could also handle negative stride here someday, that will require the // validity check in mayLoopAccessLocation to be updated though. 
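processLoopStore and processLoopMemSet, which continue in the next hunk, accept only affine address recurrences whose per-iteration stride equals the store size; when that holds and mayLoopAccessLocation rules out interfering accesses, the whole loop collapses to one library call. In source terms, a hypothetical example:

#include <cstring>

// Recognized shape: contiguous stores of a bytewise-splatable value.
void zero_fill(int *p, unsigned n) {
  for (unsigned i = 0; i != n; ++i)
    p[i] = 0;                 // address SCEV {p,+,4}: stride 4 == store size 4
}

// What the pass emits instead; # bytes = (BECount + 1) * StoreSize.
void zero_fill_idiom(int *p, unsigned n) {
  if (n)                      // guard mirrors the loop's zero-trip test
    std::memset(p, 0, sizeof(int) * n);
}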
- if (Stride == 0 || MSI->getLength() != Stride->getValue()) + if (!Stride || MSI->getLength() != Stride->getValue()) return false; return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, @@ -905,28 +904,28 @@ static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access, /// /// Note that we don't ever attempt to use memset_pattern8 or 4, because these /// just replicate their input array and then pass on to memset_pattern16. -static Constant *getMemSetPatternValue(Value *V, const DataLayout &TD) { +static Constant *getMemSetPatternValue(Value *V, const DataLayout &DL) { // If the value isn't a constant, we can't promote it to being in a constant // array. We could theoretically do a store to an alloca or something, but // that doesn't seem worthwhile. Constant *C = dyn_cast<Constant>(V); - if (C == 0) return 0; + if (!C) return nullptr; // Only handle simple values that are a power of two bytes in size. - uint64_t Size = TD.getTypeSizeInBits(V->getType()); + uint64_t Size = DL.getTypeSizeInBits(V->getType()); if (Size == 0 || (Size & 7) || (Size & (Size-1))) - return 0; + return nullptr; // Don't care enough about darwin/ppc to implement this. - if (TD.isBigEndian()) - return 0; + if (DL.isBigEndian()) + return nullptr; // Convert to size in bytes. Size /= 8; // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see // if the top and bottom are the same (e.g. for vectors and large integers). - if (Size > 16) return 0; + if (Size > 16) return nullptr; // If the constant is exactly 16 bytes, just use it. if (Size == 16) return C; @@ -951,7 +950,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // are stored. A store of i32 0x01020304 can never be turned into a memset, // but it can be turned into memset_pattern if the target supports it. Value *SplatValue = isBytewiseValue(StoredVal); - Constant *PatternValue = 0; + Constant *PatternValue = nullptr; unsigned DestAS = DestPtr->getType()->getPointerAddressSpace(); @@ -962,13 +961,13 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // promote the memset. CurLoop->isLoopInvariant(SplatValue)) { // Keep and use SplatValue. - PatternValue = 0; + PatternValue = nullptr; } else if (DestAS == 0 && TLI->has(LibFunc::memset_pattern16) && - (PatternValue = getMemSetPatternValue(StoredVal, *TD))) { + (PatternValue = getMemSetPatternValue(StoredVal, *DL))) { // Don't create memset_pattern16s with address spaces. // It looks like we can use PatternValue! - SplatValue = 0; + SplatValue = nullptr; } else { // Otherwise, this isn't an idiom we can transform. For example, we can't // do anything with a 3-byte store. @@ -1006,7 +1005,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. - Type *IntPtr = Builder.getIntPtrTy(TD, DestAS); + Type *IntPtr = Builder.getIntPtrTy(DL, DestAS); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), @@ -1035,7 +1034,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, Int8PtrTy, Int8PtrTy, IntPtr, - (void*)0); + (void*)nullptr); // Otherwise we should form a memset_pattern16. PatternValue is known to be // a constant array of 16 bytes. Plop the value into a mergeable global. @@ -1120,7 +1119,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size.
Expand the trip count out to // pointer size if it isn't already. - Type *IntPtrTy = Builder.getIntPtrTy(TD, SI->getPointerAddressSpace()); + Type *IntPtrTy = Builder.getIntPtrTy(DL, SI->getPointerAddressSpace()); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy); const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtrTy, 1), diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp index a23860a..ab1a939 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -11,21 +11,22 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-instsimplify" #include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/Analysis/Dominators.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "loop-instsimplify" + STATISTIC(NumSimplified, "Number of redundant instructions simplified"); namespace { @@ -36,9 +37,9 @@ namespace { initializeLoopInstSimplifyPass(*PassRegistry::getPassRegistry()); } - bool runOnLoop(Loop*, LPPassManager&); + bool runOnLoop(Loop*, LPPassManager&) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<LoopInfo>(); AU.addRequiredID(LoopSimplifyID); @@ -54,7 +55,7 @@ char LoopInstSimplify::ID = 0; INITIALIZE_PASS_BEGIN(LoopInstSimplify, "loop-instsimplify", "Simplify instructions in loops", false, false) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_END(LoopInstSimplify, "loop-instsimplify", @@ -65,9 +66,15 @@ Pass *llvm::createLoopInstSimplifyPass() { } bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { - DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>(); + if (skipOptnoneFunction(L)) + return false; + + DominatorTreeWrapperPass *DTWP = + getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; LoopInfo *LI = &getAnalysis<LoopInfo>(); - const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); SmallVector<BasicBlock*, 8> ExitBlocks; @@ -109,19 +116,26 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Don't bother simplifying unused instructions. if (!I->use_empty()) { - Value *V = SimplifyInstruction(I, TD, TLI, DT); + Value *V = SimplifyInstruction(I, DL, TLI, DT); if (V && LI->replacementPreservesLCSSAForm(I, V)) { // Mark all uses for resimplification next time round the loop. 
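The hunk resumes with one instance of another commit-wide cleanup: explicit use_begin()/use_end() walks become range-based loops, using users() when only the using instruction matters and uses() when the Use edge itself is needed (as for PHINode::getIncomingBlock(U) in the LoopReroll hunks below). A stand-alone sketch of both forms; the helper function is hypothetical:

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"
using namespace llvm;

static unsigned countInstructionUsers(Value *V) {
  unsigned N = 0;
  // New form #1: iterate the users directly.
  for (User *U : V->users())
    if (isa<Instruction>(U))
      ++N;
  // New form #2: iterate the Use edges when the edge itself is needed,
  // e.g. to ask a PHI which incoming block a particular Use comes from.
  for (Use &U : V->uses())
    (void)U.getUser();
  return N;
}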
- for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); - UI != UE; ++UI) - Next->insert(cast<Instruction>(*UI)); + for (User *U : I->users()) + Next->insert(cast<Instruction>(U)); I->replaceAllUsesWith(V); LocalChanged = true; ++NumSimplified; } } - LocalChanged |= RecursivelyDeleteTriviallyDeadInstructions(I, TLI); + bool res = RecursivelyDeleteTriviallyDeadInstructions(I, TLI); + if (res) { + // RecursivelyDeleteTriviallyDeadInstructions can remove + // more than one instruction, so simply incrementing the + // iterator does not work. When instructions get deleted + // re-iterate instead. + BI = BB->begin(); BE = BB->end(); + LocalChanged |= res; + } if (IsSubloopHeader && !isa<PHINode>(I)) break; diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp index 643bc78..b6fbb16 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -11,11 +11,10 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-reroll" #include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/LoopPass.h" @@ -24,6 +23,7 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -35,6 +35,8 @@ using namespace llvm; +#define DEBUG_TYPE "loop-reroll" + STATISTIC(NumRerolledLoops, "Number of rerolled loops"); static cl::opt<unsigned> @@ -124,14 +126,14 @@ namespace { initializeLoopRerollPass(*PassRegistry::getPassRegistry()); } - bool runOnLoop(Loop *L, LPPassManager &LPM); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AliasAnalysis>(); AU.addRequired<LoopInfo>(); AU.addPreserved<LoopInfo>(); - AU.addRequired<DominatorTree>(); - AU.addPreserved<DominatorTree>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<ScalarEvolution>(); AU.addRequired<TargetLibraryInfo>(); } @@ -140,7 +142,7 @@ protected: AliasAnalysis *AA; LoopInfo *LI; ScalarEvolution *SE; - DataLayout *DL; + const DataLayout *DL; TargetLibraryInfo *TLI; DominatorTree *DT; @@ -189,12 +191,12 @@ protected: iterator begin() { assert(Valid && "Using invalid reduction"); - return llvm::next(Instructions.begin()); + return std::next(Instructions.begin()); } const_iterator begin() const { assert(Valid && "Using invalid reduction"); - return llvm::next(Instructions.begin()); + return std::next(Instructions.begin()); } iterator end() { return Instructions.end(); } @@ -340,7 +342,7 @@ char LoopReroll::ID = 0; INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_DEPENDENCY(LoopInfo) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false) @@ -353,12 +355,9 @@ Pass
*llvm::createLoopRerollPass() { // This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in // non-loop blocks to be outside the loop. static bool hasUsesOutsideLoop(Instruction *I, Loop *L) { - for (Value::use_iterator UI = I->use_begin(), - UIE = I->use_end(); UI != UIE; ++UI) { - Instruction *User = cast<Instruction>(*UI); - if (!L->contains(User)) + for (User *U : I->users()) + if (!L->contains(cast<Instruction>(U))) return true; - } return false; } @@ -408,7 +407,7 @@ void LoopReroll::SimpleLoopReduction::add(Loop *L) { Instruction *C = Instructions.front(); do { - C = cast<Instruction>(*C->use_begin()); + C = cast<Instruction>(*C->user_begin()); if (C->hasOneUse()) { if (!C->isBinaryOp()) return; @@ -423,17 +422,15 @@ void LoopReroll::SimpleLoopReduction::add(Loop *L) { if (Instructions.size() < 2 || !C->isSameOperationAs(Instructions.back()) || - C->use_begin() == C->use_end()) + C->use_empty()) return; // C is now the (potential) last instruction in the reduction chain. - for (Value::use_iterator UI = C->use_begin(), UIE = C->use_end(); - UI != UIE; ++UI) { + for (User *U : C->users()) // The only in-loop user can be the initial PHI. - if (L->contains(cast<Instruction>(*UI))) - if (cast<Instruction>(*UI ) != Instructions.front()) + if (L->contains(cast<Instruction>(U))) + if (cast<Instruction>(U) != Instructions.front()) return; - } Instructions.push_back(C); Valid = true; @@ -483,12 +480,11 @@ void LoopReroll::collectInLoopUserSet(Loop *L, continue; if (!Final.count(I)) - for (Value::use_iterator UI = I->use_begin(), - UIE = I->use_end(); UI != UIE; ++UI) { - Instruction *User = cast<Instruction>(*UI); + for (Use &U : I->uses()) { + Instruction *User = cast<Instruction>(U.getUser()); if (PHINode *PN = dyn_cast<PHINode>(User)) { // Ignore "wrap-around" uses to PHIs of this loop's header. 
- if (PN->getIncomingBlock(UI) == L->getHeader()) + if (PN->getIncomingBlock(U) == L->getHeader()) continue; } @@ -559,8 +555,8 @@ bool LoopReroll::findScaleFromMul(Instruction *RealIV, uint64_t &Scale, if (RealIV->getNumUses() != 2) return false; const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(RealIV)); - Instruction *User1 = cast<Instruction>(*RealIV->use_begin()), - *User2 = cast<Instruction>(*llvm::next(RealIV->use_begin())); + Instruction *User1 = cast<Instruction>(*RealIV->user_begin()), + *User2 = cast<Instruction>(*std::next(RealIV->user_begin())); if (!SE->isSCEVable(User1->getType()) || !SE->isSCEVable(User2->getType())) return false; const SCEVAddRecExpr *User1SCEV = @@ -616,26 +612,25 @@ bool LoopReroll::collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, SmallVector<SmallInstructionVector, 32> &Roots, SmallInstructionSet &AllRoots, SmallInstructionVector &LoopIncs) { - for (Value::use_iterator UI = IV->use_begin(), - UIE = IV->use_end(); UI != UIE; ++UI) { - Instruction *User = cast<Instruction>(*UI); - if (!SE->isSCEVable(User->getType())) + for (User *U : IV->users()) { + Instruction *UI = cast<Instruction>(U); + if (!SE->isSCEVable(UI->getType())) continue; - if (User->getType() != IV->getType()) + if (UI->getType() != IV->getType()) continue; - if (!L->contains(User)) + if (!L->contains(UI)) continue; - if (hasUsesOutsideLoop(User, L)) + if (hasUsesOutsideLoop(UI, L)) continue; if (const SCEVConstant *Diff = dyn_cast<SCEVConstant>(SE->getMinusSCEV( - SE->getSCEV(User), SE->getSCEV(IV)))) { + SE->getSCEV(UI), SE->getSCEV(IV)))) { uint64_t Idx = Diff->getValue()->getValue().getZExtValue(); if (Idx > 0 && Idx < Scale) { - Roots[Idx-1].push_back(User); - AllRoots.insert(User); + Roots[Idx-1].push_back(UI); + AllRoots.insert(UI); } else if (Idx == Scale && Inc > 1) { - LoopIncs.push_back(User); + LoopIncs.push_back(UI); } } } @@ -719,10 +714,8 @@ void LoopReroll::ReductionTracker::replaceSelected() { // Replace users with the new end-of-chain value. SmallInstructionVector Users; - for (Value::use_iterator UI = - PossibleReds[i].getReducedValue()->use_begin(), - UIE = PossibleReds[i].getReducedValue()->use_end(); UI != UIE; ++UI) - Users.push_back(cast<Instruction>(*UI)); + for (User *U : PossibleReds[i].getReducedValue()->users()) + Users.push_back(cast<Instruction>(U)); for (SmallInstructionVector::iterator J = Users.begin(), JE = Users.end(); J != JE; ++J) @@ -931,8 +924,10 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, // them, and this matching fails. As an exception, we allow the alias // set tracker to handle regular (simple) load/store dependencies. if (FutureSideEffects && - ((!isSimpleLoadStore(J1) && !isSafeToSpeculativelyExecute(J1)) || - (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2)))) { + ((!isSimpleLoadStore(J1) && + !isSafeToSpeculativelyExecute(J1, DL)) || + (!isSimpleLoadStore(J2) && + !isSafeToSpeculativelyExecute(J2, DL)))) { DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << " vs. 
" << *J2 << " (side effects prevent reordering)\n"); @@ -953,7 +948,7 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, bool InReduction = Reductions.isPairInSame(J1, J2); if (!(InReduction && J1->isAssociative())) { - bool Swapped = false, SomeOpMatched = false;; + bool Swapped = false, SomeOpMatched = false; for (unsigned j = 0; j < J1->getNumOperands() && !MatchFailed; ++j) { Value *Op2 = J2->getOperand(j); @@ -1133,12 +1128,16 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, } bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { + if (skipOptnoneFunction(L)) + return false; + AA = &getAnalysis<AliasAnalysis>(); LI = &getAnalysis<LoopInfo>(); SE = &getAnalysis<ScalarEvolution>(); TLI = &getAnalysis<TargetLibraryInfo>(); - DL = getAnalysisIfAvailable<DataLayout>(); - DT = &getAnalysis<DominatorTree>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp index 14c5655..2ce5831 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-rotate" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CodeMetrics.h" @@ -20,9 +19,11 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/Support/CFG.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" @@ -30,7 +31,11 @@ #include "llvm/Transforms/Utils/ValueMapper.h" using namespace llvm; -#define MAX_HEADER_SIZE 16 +#define DEBUG_TYPE "loop-rotate" + +static cl::opt<unsigned> +DefaultRotationThreshold("rotation-max-header-size", cl::init(16), cl::Hidden, + cl::desc("The default maximum header size for automatic loop rotation")); STATISTIC(NumRotated, "Number of loops rotated"); namespace { @@ -38,13 +43,17 @@ namespace { class LoopRotate : public LoopPass { public: static char ID; // Pass ID, replacement for typeid - LoopRotate() : LoopPass(ID) { + LoopRotate(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) { initializeLoopRotatePass(*PassRegistry::getPassRegistry()); + if (SpecifiedMaxHeaderSize == -1) + MaxHeaderSize = DefaultRotationThreshold; + else + MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize); } // LCSSA form makes instruction renaming easier. 
- virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addPreserved<DominatorTree>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfo>(); AU.addPreserved<LoopInfo>(); AU.addRequiredID(LoopSimplifyID); @@ -55,11 +64,12 @@ namespace { AU.addRequired<TargetTransformInfo>(); } - bool runOnLoop(Loop *L, LPPassManager &LPM); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; bool simplifyLoopLatch(Loop *L); bool rotateLoop(Loop *L, bool SimplifiedLatch); private: + unsigned MaxHeaderSize; LoopInfo *LI; const TargetTransformInfo *TTI; }; @@ -73,11 +83,19 @@ INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false) -Pass *llvm::createLoopRotatePass() { return new LoopRotate(); } +Pass *llvm::createLoopRotatePass(int MaxHeaderSize) { + return new LoopRotate(MaxHeaderSize); +} /// Rotate Loop L as many times as possible. Return true if /// the loop is rotated at least once. bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { + if (skipOptnoneFunction(L)) + return false; + + // Save the loop metadata. + MDNode *LoopMD = L->getLoopID(); + LI = &getAnalysis<LoopInfo>(); TTI = &getAnalysis<TargetTransformInfo>(); @@ -92,6 +110,12 @@ bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { MadeChange = true; SimplifiedLatch = false; } + + // Restore the loop metadata. + // NB! We presume LoopRotation DOESN'T ADD its own metadata. + if ((MadeChange || SimplifiedLatch) && LoopMD) + L->setLoopID(LoopMD); + return MadeChange; } @@ -130,7 +154,7 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, for (Value::use_iterator UI = OrigHeaderVal->use_begin(), UE = OrigHeaderVal->use_end(); UI != UE; ) { // Grab the use before incrementing the iterator. - Use &U = UI.getUse(); + Use &U = *UI; // Increment the iterator before removing the use from the list. ++UI; @@ -251,8 +275,9 @@ bool LoopRotate::simplifyLoopLatch(Loop *L) { // Nuke the Latch block. assert(Latch->empty() && "unable to evacuate Latch"); LI->removeBlock(Latch); - if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) - DT->eraseNode(Latch); + if (DominatorTreeWrapperPass *DTWP = + getAnalysisIfAvailable<DominatorTreeWrapperPass>()) + DTWP->getDomTree().eraseNode(Latch); Latch->eraseFromParent(); return true; } @@ -276,7 +301,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { BasicBlock *OrigLatch = L->getLoopLatch(); BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator()); - if (BI == 0 || BI->isUnconditional()) + if (!BI || BI->isUnconditional()) return false; // If the loop header is not one of the loop exiting blocks then @@ -287,7 +312,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // If the loop latch already contains a branch that leaves the loop then the // loop is already rotated. 
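As context for the checks that follow: rotation turns a top-tested loop into a bottom-tested one, duplicating the header's exit test into the old preheader as a guard. Shape only, with hypothetical cond() and body() stand-ins:

extern bool cond();
extern void body();

// Before: the header both tests and starts the loop, so the latch branch is
// unconditional and the loop is not yet in rotated form.
void topTested() {
  while (cond())
    body();
}

// After: the copied test guards entry (and is foldable when provably true),
// while the latch now carries the exiting branch.
void rotated() {
  if (cond()) {
    do
      body();
    while (cond());
  }
}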
- if (OrigLatch == 0) + if (!OrigLatch) return false; // Rotate if either the loop latch does *not* exit the loop, or if the loop @@ -301,11 +326,11 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { CodeMetrics Metrics; Metrics.analyzeBasicBlock(OrigHeader, *TTI); if (Metrics.notDuplicatable) { - DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non duplicatable" + DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" << " instructions: "; L->dump()); return false; } - if (Metrics.NumInsts > MAX_HEADER_SIZE) + if (Metrics.NumInsts > MaxHeaderSize) return false; } @@ -314,7 +339,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // If the loop could not be converted to canonical form, it must have an // indirectbr in it, just give up. - if (OrigPreheader == 0) + if (!OrigPreheader) return false; // Anything ScalarEvolution may know about this loop or the PHI nodes @@ -433,23 +458,25 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // The conditional branch can't be folded, handle the general case. // Update DominatorTree to reflect the CFG change we just made. Then split // edges as necessary to preserve LoopSimplify form. - if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) { + if (DominatorTreeWrapperPass *DTWP = + getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { + DominatorTree &DT = DTWP->getDomTree(); // Everything that was dominated by the old loop header is now dominated // by the original loop preheader. Conceptually the header was merged // into the preheader, even though we reuse the actual block as a new // loop latch. - DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader); + DomTreeNode *OrigHeaderNode = DT.getNode(OrigHeader); SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(), OrigHeaderNode->end()); - DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader); + DomTreeNode *OrigPreheaderNode = DT.getNode(OrigPreheader); for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) - DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode); + DT.changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode); - assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode); - assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode); + assert(DT.getNode(Exit)->getIDom() == OrigPreheaderNode); + assert(DT.getNode(NewHeader)->getIDom() == OrigPreheaderNode); // Update OrigHeader to be dominated by the new header block. - DT->changeImmediateDominator(OrigHeader, OrigLatch); + DT.changeImmediateDominator(OrigHeader, OrigLatch); } // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and @@ -459,9 +486,24 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { NewPH->setName(NewHeader->getName() + ".lr.ph"); // Preserve canonical loop form, which means that 'Exit' should have only - // one predecessor. - BasicBlock *ExitSplit = SplitCriticalEdge(L->getLoopLatch(), Exit, this); - ExitSplit->moveBefore(Exit); + // one predecessor. Note that Exit could be an exit block for multiple + // nested loops, causing both of the edges to now be critical and need to + // be split. + SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit)); + bool SplitLatchEdge = false; + for (SmallVectorImpl<BasicBlock *>::iterator PI = ExitPreds.begin(), + PE = ExitPreds.end(); + PI != PE; ++PI) { + // We only need to split loop exit edges. 
+ Loop *PredLoop = LI->getLoopFor(*PI); + if (!PredLoop || PredLoop->contains(Exit)) + continue; + SplitLatchEdge |= L->getLoopLatch() == *PI; + BasicBlock *ExitSplit = SplitCriticalEdge(*PI, Exit, this); + ExitSplit->moveBefore(Exit); + } + assert(SplitLatchEdge && + "Despite splitting all preds, failed to split latch exit?"); } else { // We can fold the conditional branch in the preheader, this makes things // simpler. The first step is to remove the extra edge to the Exit block. @@ -471,15 +513,17 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { PHBI->eraseFromParent(); // With our CFG finalized, update DomTree if it is available. - if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) { + if (DominatorTreeWrapperPass *DTWP = + getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { + DominatorTree &DT = DTWP->getDomTree(); // Update OrigHeader to be dominated by the new header block. - DT->changeImmediateDominator(NewHeader, OrigPreheader); - DT->changeImmediateDominator(OrigHeader, OrigLatch); + DT.changeImmediateDominator(NewHeader, OrigPreheader); + DT.changeImmediateDominator(OrigHeader, OrigLatch); // Brute force incremental dominator tree update. Call // findNearestCommonDominator on all CFG predecessors of each child of the // original header. - DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader); + DomTreeNode *OrigHeaderNode = DT.getNode(OrigHeader); SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(), OrigHeaderNode->end()); bool Changed; @@ -492,11 +536,11 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { pred_iterator PI = pred_begin(BB); BasicBlock *NearestDom = *PI; for (pred_iterator PE = pred_end(BB); PI != PE; ++PI) - NearestDom = DT->findNearestCommonDominator(NearestDom, *PI); + NearestDom = DT.findNearestCommonDominator(NearestDom, *PI); // Remember if this changes the DomTree. 
if (Node->getIDom()->getBlock() != NearestDom) { - DT->changeImmediateDominator(BB, NearestDom); + DT.changeImmediateDominator(BB, NearestDom); Changed = true; } } diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 6133962..914b56a 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -53,31 +53,32 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-reduce" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallBitVector.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/IVUsers.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Assembly/Writer.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include <algorithm> using namespace llvm; +#define DEBUG_TYPE "loop-reduce" + /// MaxIVUsers is an arbitrary threshold that provides an early opportunity to /// bail out. This threshold is far beyond the number of users that LSR can /// conceivably solve, so it should not affect generated code, but catches the @@ -237,7 +238,15 @@ struct Formula { int64_t Scale; /// BaseRegs - The list of "base" registers for this use. When this is - /// non-empty, + /// non-empty. The canonical representation of a formula is + /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and + /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty(). + /// #1 enforces that the scaled register is always used when at least two + /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2. + /// #2 enforces that 1 * reg is reg. + /// This invariant can be temporarily broken while building a formula. + /// However, every formula inserted into the LSRInstance must be in canonical + /// form. SmallVector<const SCEV *, 4> BaseRegs; /// ScaledReg - The 'scaled' register for this use. This should be non-null @@ -250,12 +259,18 @@ struct Formula { int64_t UnfoldedOffset; Formula() - : BaseGV(0), BaseOffset(0), HasBaseReg(false), Scale(0), ScaledReg(0), - UnfoldedOffset(0) {} + : BaseGV(nullptr), BaseOffset(0), HasBaseReg(false), Scale(0), + ScaledReg(nullptr), UnfoldedOffset(0) {} void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); - unsigned getNumRegs() const; + bool isCanonical() const; + + void Canonicalize(); + + bool Unscale(); + + size_t getNumRegs() const; Type *getType() const; void DeleteBaseReg(const SCEV *&S); @@ -345,12 +360,58 @@ void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { BaseRegs.push_back(Sum); HasBaseReg = true; } + Canonicalize(); +} + +/// \brief Check whether or not this formula satisfies the canonical +/// representation. +/// \see Formula::BaseRegs.
+bool Formula::isCanonical() const { + if (ScaledReg) + return Scale != 1 || !BaseRegs.empty(); + return BaseRegs.size() <= 1; +} + +/// \brief Helper method to morph a formula into its canonical representation. +/// \see Formula::BaseRegs. +/// Every formula having more than one base register must use the ScaledReg +/// field. Otherwise, we would have to do special cases everywhere in LSR +/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ... +/// On the other hand, 1*reg should be canonicalized into reg. +void Formula::Canonicalize() { + if (isCanonical()) + return; + // So far we did not need this case. This is easy to implement but it is + // useless to maintain dead code. Besides, it could hurt compile time. + assert(!BaseRegs.empty() && "1*reg => reg, should not be needed."); + // Keep the invariant sum in BaseRegs and one of the variant sums in ScaledReg. + ScaledReg = BaseRegs.back(); + BaseRegs.pop_back(); + Scale = 1; + size_t BaseRegsSize = BaseRegs.size(); + size_t Try = 0; + // If ScaledReg is an invariant, try to find a variant expression. + while (Try < BaseRegsSize && !isa<SCEVAddRecExpr>(ScaledReg)) + std::swap(ScaledReg, BaseRegs[Try++]); +} + +/// \brief Get rid of the scale in the formula. +/// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2. +/// \return true if it was possible to get rid of the scale, false otherwise. +/// \note After this operation the formula may not be in the canonical form. +bool Formula::Unscale() { + if (Scale != 1) + return false; + Scale = 0; + BaseRegs.push_back(ScaledReg); + ScaledReg = nullptr; + return true; } /// getNumRegs - Return the total number of register operands used by this /// formula. This does not include register uses implied by non-constant /// addrec strides. -unsigned Formula::getNumRegs() const { +size_t Formula::getNumRegs() const { return !!ScaledReg + BaseRegs.size(); } @@ -360,7 +421,7 @@ Type *Formula::getType() const { return !BaseRegs.empty() ? BaseRegs.front()->getType() : ScaledReg ? ScaledReg->getType() : BaseGV ? BaseGV->getType() : - 0; + nullptr; } /// DeleteBaseReg - Delete the given base reg from the BaseRegs list. @@ -394,7 +455,7 @@ void Formula::print(raw_ostream &OS) const { bool First = true; if (BaseGV) { if (!First) OS << " + "; else First = false; - WriteAsOperand(OS, BaseGV, /*PrintType=*/false); + BaseGV->printAsOperand(OS, /*PrintType=*/false); } if (BaseOffset != 0) { if (!First) OS << " + "; else First = false; @@ -422,7 +483,7 @@ void Formula::print(raw_ostream &OS) const { OS << ')'; } if (UnfoldedOffset != 0) { - if (!First) OS << " + "; else First = false; + if (!First) OS << " + "; OS << "imm(" << UnfoldedOffset << ')'; } } @@ -487,11 +548,11 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, // Check for a division of a constant by a constant.
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) { if (!RC) - return 0; + return nullptr; const APInt &LA = C->getValue()->getValue(); const APInt &RA = RC->getValue()->getValue(); if (LA.srem(RA) != 0) - return 0; + return nullptr; return SE.getConstant(LA.sdiv(RA)); } @@ -500,16 +561,16 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) { const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE, IgnoreSignificantBits); - if (!Step) return 0; + if (!Step) return nullptr; const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE, IgnoreSignificantBits); - if (!Start) return 0; + if (!Start) return nullptr; // FlagNW is independent of the start value, step direction, and is // preserved with smaller magnitude steps. // FIXME: AR->getNoWrapFlags(SCEV::FlagNW) return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap); } - return 0; + return nullptr; } // Distribute the sdiv over add operands, if the add doesn't overflow. @@ -520,12 +581,12 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, I != E; ++I) { const SCEV *Op = getExactSDiv(*I, RHS, SE, IgnoreSignificantBits); - if (!Op) return 0; + if (!Op) return nullptr; Ops.push_back(Op); } return SE.getAddExpr(Ops); } - return 0; + return nullptr; } // Check for a multiply operand that we can pull RHS out of. @@ -544,13 +605,13 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, } Ops.push_back(S); } - return Found ? SE.getMulExpr(Ops) : 0; + return Found ? SE.getMulExpr(Ops) : nullptr; } - return 0; + return nullptr; } // Otherwise we don't know. - return 0; + return nullptr; } /// ExtractImmediate - If S involves the addition of a constant integer value, @@ -604,7 +665,7 @@ static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) { SCEV::FlagAnyWrap); return Result; } - return 0; + return nullptr; } /// isAddressUse - Returns true if the specified instruction is using the @@ -723,13 +784,12 @@ static bool isHighCostExpansion(const SCEV *S, // multiplication already generates this expression. if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) { Value *UVal = U->getValue(); - for (Value::use_iterator UI = UVal->use_begin(), UE = UVal->use_end(); - UI != UE; ++UI) { + for (User *UR : UVal->users()) { // If U is a constant, it may be used by a ConstantExpr. - Instruction *User = dyn_cast<Instruction>(*UI); - if (User && User->getOpcode() == Instruction::Mul - && SE.isSCEVable(User->getType())) { - return SE.getSCEV(User) == Mul; + Instruction *UI = dyn_cast<Instruction>(UR); + if (UI && UI->getOpcode() == Instruction::Mul && + SE.isSCEVable(UI->getType())) { + return SE.getSCEV(UI) == Mul; } } } @@ -756,12 +816,12 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) { Value *V = DeadInsts.pop_back_val(); Instruction *I = dyn_cast_or_null<Instruction>(V); - if (I == 0 || !isInstructionTriviallyDead(I)) + if (!I || !isInstructionTriviallyDead(I)) continue; for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) if (Instruction *U = dyn_cast<Instruction>(*OI)) { - *OI = 0; + *OI = nullptr; if (U->use_empty()) DeadInsts.push_back(U); } @@ -776,9 +836,18 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) { namespace { class LSRUse; } -// Check if it is legal to fold 2 base registers. 
-static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU, - const Formula &F); + +/// \brief Check if the addressing mode defined by \p F is completely +/// folded in \p LU at isel time. +/// This includes address-mode folding and special icmp tricks. +/// This function returns true if \p LU can accommodate what \p F +/// defines and up to 1 base + 1 scaled + offset. +/// In other words, if \p F has several base registers, this function may +/// still return true. Therefore, users still need to account for +/// additional base registers and/or unfolded offsets to derive an +/// accurate cost model. +static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, + const LSRUse &LU, const Formula &F); // Get the cost of the scaling factor used in F for LU. static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F); @@ -804,7 +873,7 @@ public: bool operator<(const Cost &Other) const; - void Loose(); + void Lose(); #ifndef NDEBUG // Once any of the metrics loses, they must all remain losers. @@ -829,7 +898,7 @@ public: const SmallVectorImpl<int64_t> &Offsets, ScalarEvolution &SE, DominatorTree &DT, const LSRUse &LU, - SmallPtrSet<const SCEV *, 16> *LoserRegs = 0); + SmallPtrSet<const SCEV *, 16> *LoserRegs = nullptr); void print(raw_ostream &OS) const; void dump() const; @@ -864,7 +933,7 @@ void Cost::RateRegister(const SCEV *Reg, return; // Otherwise, do not consider this formula at all. - Loose(); + Lose(); return; } AddRecCost += 1; /// TODO: This should be a function of the stride. @@ -903,7 +972,7 @@ void Cost::RatePrimaryRegister(const SCEV *Reg, ScalarEvolution &SE, DominatorTree &DT, SmallPtrSet<const SCEV *, 16> *LoserRegs) { if (LoserRegs && LoserRegs->count(Reg)) { - Loose(); + Lose(); return; } if (Regs.insert(Reg)) { @@ -922,10 +991,11 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, ScalarEvolution &SE, DominatorTree &DT, const LSRUse &LU, SmallPtrSet<const SCEV *, 16> *LoserRegs) { + assert(F.isCanonical() && "Cost is accurate only for canonical formula"); // Tally up the registers. if (const SCEV *ScaledReg = F.ScaledReg) { if (VisitedRegs.count(ScaledReg)) { - Loose(); + Lose(); return; } RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs); @@ -936,7 +1006,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, E = F.BaseRegs.end(); I != E; ++I) { const SCEV *BaseReg = *I; if (VisitedRegs.count(BaseReg)) { - Loose(); + Lose(); return; } RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs); @@ -945,11 +1015,13 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, } // Determine how many (unfolded) adds we'll need inside the loop. - size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0); + size_t NumBaseParts = F.getNumRegs(); if (NumBaseParts > 1) // Do not count the base and a possible second register if the target // allows to fold 2 registers. - NumBaseAdds += NumBaseParts - (1 + isLegal2RegAMUse(TTI, LU, F)); + NumBaseAdds += + NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F))); + NumBaseAdds += (F.UnfoldedOffset != 0); // Accumulate non-free scaling amounts. ScaleCost += getScalingFactorCost(TTI, LU, F); @@ -967,8 +1039,8 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, assert(isValid() && "invalid cost"); } -/// Loose - Set this cost to a losing value. -void Cost::Loose() { +/// Lose - Set this cost to a losing value. 
+void Cost::Lose() { NumRegs = ~0u; AddRecCost = ~0u; NumIVMuls = ~0u; @@ -980,21 +1052,11 @@ void Cost::Loose() { /// operator< - Choose the lower cost. bool Cost::operator<(const Cost &Other) const { - if (NumRegs != Other.NumRegs) - return NumRegs < Other.NumRegs; - if (AddRecCost != Other.AddRecCost) - return AddRecCost < Other.AddRecCost; - if (NumIVMuls != Other.NumIVMuls) - return NumIVMuls < Other.NumIVMuls; - if (NumBaseAdds != Other.NumBaseAdds) - return NumBaseAdds < Other.NumBaseAdds; - if (ScaleCost != Other.ScaleCost) - return ScaleCost < Other.ScaleCost; - if (ImmCost != Other.ImmCost) - return ImmCost < Other.ImmCost; - if (SetupCost != Other.SetupCost) - return SetupCost < Other.SetupCost; - return false; + return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost, + ImmCost, SetupCost) < + std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls, + Other.NumBaseAdds, Other.ScaleCost, Other.ImmCost, + Other.SetupCost); } void Cost::print(raw_ostream &OS) const { @@ -1058,7 +1120,8 @@ struct LSRFixup { } LSRFixup::LSRFixup() - : UserInst(0), OperandValToReplace(0), LUIdx(~size_t(0)), Offset(0) {} + : UserInst(nullptr), OperandValToReplace(nullptr), LUIdx(~size_t(0)), + Offset(0) {} /// isUseFullyOutsideLoop - Test whether this fixup always uses its /// value outside of the given loop. @@ -1080,19 +1143,19 @@ void LSRFixup::print(raw_ostream &OS) const { // Store is common and interesting enough to be worth special-casing. if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) { OS << "store "; - WriteAsOperand(OS, Store->getOperand(0), /*PrintType=*/false); + Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false); } else if (UserInst->getType()->isVoidTy()) OS << UserInst->getOpcodeName(); else - WriteAsOperand(OS, UserInst, /*PrintType=*/false); + UserInst->printAsOperand(OS, /*PrintType=*/false); OS << ", OperandValToReplace="; - WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false); + OperandValToReplace->printAsOperand(OS, /*PrintType=*/false); for (PostIncLoopSet::const_iterator I = PostIncLoops.begin(), E = PostIncLoops.end(); I != E; ++I) { OS << ", PostIncLoop="; - WriteAsOperand(OS, (*I)->getHeader(), /*PrintType=*/false); + (*I)->getHeader()->printAsOperand(OS, /*PrintType=*/false); } if (LUIdx != ~size_t(0)) @@ -1126,11 +1189,7 @@ struct UniquifierDenseMapInfo { } static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) { - unsigned Result = 0; - for (SmallVectorImpl<const SCEV *>::const_iterator I = V.begin(), - E = V.end(); I != E; ++I) - Result ^= DenseMapInfo<const SCEV *>::getHashValue(*I); - return Result; + return static_cast<unsigned>(hash_combine_range(V.begin(), V.end())); } static bool isEqual(const SmallVector<const SCEV *, 4> &LHS, @@ -1158,6 +1217,8 @@ public: // TODO: Add a generic icmp too? }; + typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair; + KindType Kind; Type *AccessTy; @@ -1196,7 +1257,7 @@ public: MaxOffset(INT64_MIN), AllFixupsOutsideLoop(true), RigidFormula(false), - WidestFixupType(0) {} + WidestFixupType(nullptr) {} bool HasFormulaWithSameRegs(const Formula &F) const; bool InsertFormula(const Formula &F); @@ -1221,7 +1282,10 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { /// InsertFormula - If the given formula has not yet been inserted, add it to /// the list, and return true. Return false otherwise. +/// The formula must be in canonical form. 
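
The operator< rewrite in this hunk swaps a hand-written comparison cascade for the std::tie idiom: tying the members into tuples of references lets std::tuple's operator< perform the same lexicographic, first-difference-wins comparison. A self-contained illustration:

    #include <cassert>
    #include <tuple>

    struct ToyCost {
      unsigned NumRegs, NumBaseAdds, ImmCost;
      bool operator<(const ToyCost &Other) const {
        // Compares NumRegs first, then NumBaseAdds, then ImmCost, exactly
        // like the removed if-chain it replaces.
        return std::tie(NumRegs, NumBaseAdds, ImmCost) <
               std::tie(Other.NumRegs, Other.NumBaseAdds, Other.ImmCost);
      }
    };

    int main() {
      assert((ToyCost{1, 5, 9} < ToyCost{2, 0, 0}));  // first field decides
      assert((ToyCost{1, 5, 9} < ToyCost{1, 6, 0}));  // tie broken by second
    }
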
bool LSRUse::InsertFormula(const Formula &F) { + assert(F.isCanonical() && "Invalid canonical representation"); + if (!Formulae.empty() && RigidFormula) return false; @@ -1247,6 +1311,8 @@ bool LSRUse::InsertFormula(const Formula &F) { // Record registers now being used by this use. Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end()); + if (F.ScaledReg) + Regs.insert(F.ScaledReg); return true; } @@ -1295,7 +1361,7 @@ void LSRUse::print(raw_ostream &OS) const { for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(), E = Offsets.end(); I != E; ++I) { OS << *I; - if (llvm::next(I) != E) + if (std::next(I) != E) OS << ','; } OS << '}'; @@ -1313,12 +1379,10 @@ void LSRUse::dump() const { } #endif -/// isLegalUse - Test whether the use described by AM is "legal", meaning it can -/// be completely folded into the user instruction at isel time. This includes -/// address-mode folding and special icmp tricks. -static bool isLegalUse(const TargetTransformInfo &TTI, LSRUse::KindType Kind, - Type *AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, - bool HasBaseReg, int64_t Scale) { +static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, + LSRUse::KindType Kind, Type *AccessTy, + GlobalValue *BaseGV, int64_t BaseOffset, + bool HasBaseReg, int64_t Scale) { switch (Kind) { case LSRUse::Address: return TTI.isLegalAddressingMode(AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale); @@ -1369,10 +1433,11 @@ static bool isLegalUse(const TargetTransformInfo &TTI, LSRUse::KindType Kind, llvm_unreachable("Invalid LSRUse Kind!"); } -static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, - int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, - GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, - int64_t Scale) { +static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, + int64_t MinOffset, int64_t MaxOffset, + LSRUse::KindType Kind, Type *AccessTy, + GlobalValue *BaseGV, int64_t BaseOffset, + bool HasBaseReg, int64_t Scale) { // Check for overflow. if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) != (MinOffset > 0)) @@ -1383,9 +1448,41 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, return false; MaxOffset = (uint64_t)BaseOffset + MaxOffset; - return isLegalUse(TTI, Kind, AccessTy, BaseGV, MinOffset, HasBaseReg, - Scale) && - isLegalUse(TTI, Kind, AccessTy, BaseGV, MaxOffset, HasBaseReg, Scale); + return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset, + HasBaseReg, Scale) && + isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset, + HasBaseReg, Scale); +} + +static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, + int64_t MinOffset, int64_t MaxOffset, + LSRUse::KindType Kind, Type *AccessTy, + const Formula &F) { + // For the purpose of isAMCompletelyFolded either having a canonical formula + // or a scale not equal to zero is correct. + // Problems may arise from non canonical formulae having a scale == 0. + // Strictly speaking it would best to just rely on canonical formulae. + // However, when we generate the scaled formulae, we first check that the + // scaling factor is profitable before computing the actual ScaledReg for + // compile time sake. + assert((F.isCanonical() || F.Scale != 0)); + return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, + F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale); +} + +/// isLegalUse - Test whether we know how to expand the current formula. 
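
The "Check for overflow" guards in the renamed isAMCompletelyFolded overload use a compact trick: perform the addition in unsigned arithmetic, where wraparound is well defined, then verify that the signed result moved in the direction the offset's sign promised. The same test in isolation (converting an out-of-range unsigned value back to int64_t is implementation-defined before C++20, exactly as in the original):

    #include <cstdint>

    // Returns true when Base + Off would overflow int64_t, using the same
    // "did the sum move the right way?" comparison as the code above.
    bool offsetAddOverflows(int64_t Base, int64_t Off) {
      int64_t Sum = (int64_t)((uint64_t)Base + (uint64_t)Off);
      return (Sum > Base) != (Off > 0);
    }
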
+static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, + int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, + GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, + int64_t Scale) { + // We know how to expand completely foldable formulae. + return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV, + BaseOffset, HasBaseReg, Scale) || + // Or formulae that use a base register produced by a sum of base + // registers. + (Scale == 1 && + isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, + BaseGV, BaseOffset, true, 0)); } static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, @@ -1395,36 +1492,23 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, F.BaseOffset, F.HasBaseReg, F.Scale); } -static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU, - const Formula &F) { - // If F is used as an Addressing Mode, it may fold one Base plus one - // scaled register. If the scaled register is nil, do as if another - // element of the base regs is a 1-scaled register. - // This is possible if BaseRegs has at least 2 registers. - - // If this is not an address calculation, this is not an addressing mode - // use. - if (LU.Kind != LSRUse::Address) - return false; - - // F is already scaled. - if (F.Scale != 0) - return false; - - // We need to keep one register for the base and one to scale. - if (F.BaseRegs.size() < 2) - return false; - - return isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, - F.BaseGV, F.BaseOffset, F.HasBaseReg, 1); - } +static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, + const LSRUse &LU, const Formula &F) { + return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg, + F.Scale); +} static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F) { if (!F.Scale) return 0; - assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, - LU.AccessTy, F) && "Illegal formula in use."); + + // If the use is not completely folded in that instruction, we will have to + // pay an extra cost only for scale != 1. + if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, F)) + return F.Scale != 1; switch (LU.Kind) { case LSRUse::Address: { @@ -1443,12 +1527,10 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, return std::max(ScaleCostMinOffset, ScaleCostMaxOffset); } case LSRUse::ICmpZero: - // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg. - // Therefore, return 0 in case F.Scale == -1. - return F.Scale != -1; - case LSRUse::Basic: case LSRUse::Special: + // The use is completely folded, i.e., everything is folded into the + // instruction. return 0; } @@ -1473,7 +1555,8 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, HasBaseReg = true; } - return isLegalUse(TTI, Kind, AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale); + return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset, + HasBaseReg, Scale); } static bool isAlwaysFoldable(const TargetTransformInfo &TTI, @@ -1498,36 +1581,12 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, // base and a scale. int64_t Scale = Kind == LSRUse::ICmpZero ? 
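
SCEVUseKindPair, which replaces the hand-rolled UseMapDenseMapInfo removed above, is an llvm::PointerIntPair: the two-bit KindType rides in the low bits of the SCEV pointer, so the pair hashes and compares as a single pointer-sized key and DenseMap's default traits suffice. A freestanding toy of the packing idea (LLVM's class additionally derives the spare bits from alignment traits):

    #include <cassert>
    #include <cstdint>

    template <typename T> class PtrTagPair {
      uintptr_t Bits = 0;  // pointer with a 2-bit tag packed into bits 0-1
    public:
      PtrTagPair(T *P, unsigned Tag) {
        assert((reinterpret_cast<uintptr_t>(P) & 3u) == 0 &&
               "pointer must be at least 4-byte aligned");
        assert(Tag < 4u && "tag must fit in the two spare bits");
        Bits = reinterpret_cast<uintptr_t>(P) | Tag;
      }
      T *getPointer() const {
        return reinterpret_cast<T *>(Bits & ~uintptr_t(3));
      }
      unsigned getInt() const { return unsigned(Bits & 3u); }
      bool operator==(const PtrTagPair &O) const { return Bits == O.Bits; }
    };
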
-1 : 1; - return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV, - BaseOffset, HasBaseReg, Scale); + return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV, + BaseOffset, HasBaseReg, Scale); } namespace { -/// UseMapDenseMapInfo - A DenseMapInfo implementation for holding -/// DenseMaps and DenseSets of pairs of const SCEV* and LSRUse::Kind. -struct UseMapDenseMapInfo { - static std::pair<const SCEV *, LSRUse::KindType> getEmptyKey() { - return std::make_pair(reinterpret_cast<const SCEV *>(-1), LSRUse::Basic); - } - - static std::pair<const SCEV *, LSRUse::KindType> getTombstoneKey() { - return std::make_pair(reinterpret_cast<const SCEV *>(-2), LSRUse::Basic); - } - - static unsigned - getHashValue(const std::pair<const SCEV *, LSRUse::KindType> &V) { - unsigned Result = DenseMapInfo<const SCEV *>::getHashValue(V.first); - Result ^= DenseMapInfo<unsigned>::getHashValue(unsigned(V.second)); - return Result; - } - - static bool isEqual(const std::pair<const SCEV *, LSRUse::KindType> &LHS, - const std::pair<const SCEV *, LSRUse::KindType> &RHS) { - return LHS == RHS; - } -}; - /// IVInc - An individual increment in a Chain of IV increments. /// Relate an IV user to an expression that computes the IV it uses from the IV /// used by the previous link in the Chain. @@ -1552,7 +1611,7 @@ struct IVChain { SmallVector<IVInc,1> Incs; const SCEV *ExprBase; - IVChain() : ExprBase(0) {} + IVChain() : ExprBase(nullptr) {} IVChain(const IVInc &Head, const SCEV *Base) : Incs(1, Head), ExprBase(Base) {} @@ -1562,7 +1621,7 @@ struct IVChain { // begin - return the first increment in the chain. const_iterator begin() const { assert(!Incs.empty()); - return llvm::next(Incs.begin()); + return std::next(Incs.begin()); } const_iterator end() const { return Incs.end(); @@ -1656,9 +1715,7 @@ class LSRInstance { } // Support for sharing of LSRUses between LSRFixups. - typedef DenseMap<std::pair<const SCEV *, LSRUse::KindType>, - size_t, - UseMapDenseMapInfo> UseMapTy; + typedef DenseMap<LSRUse::SCEVUseKindPair, size_t> UseMapTy; UseMapTy UseMap; bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, @@ -1681,8 +1738,19 @@ class LSRInstance { void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base, unsigned Depth = 0); + + void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, + const Formula &Base, unsigned Depth, + size_t Idx, bool IsScaledReg = false); void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base); + void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx, + const Formula &Base, size_t Idx, + bool IsScaledReg = false); void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base); + void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx, + const Formula &Base, + const SmallVectorImpl<int64_t> &Worklist, + size_t Idx, bool IsScaledReg = false); void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base); void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base); void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base); @@ -1760,7 +1828,7 @@ void LSRInstance::OptimizeShadowIV() { IVUsers::const_iterator CandidateUI = UI; ++UI; Instruction *ShadowUse = CandidateUI->getUser(); - Type *DestTy = 0; + Type *DestTy = nullptr; bool IsSigned = false; /* If shadow use is a int->float cast then insert a second IV @@ -1822,7 +1890,7 @@ void LSRInstance::OptimizeShadowIV() { continue; /* Initialize new IV, double d = 0.0 in above example. 
*/ - ConstantInt *C = 0; + ConstantInt *C = nullptr; if (Incr->getOperand(0) == PH) C = dyn_cast<ConstantInt>(Incr->getOperand(1)); else if (Incr->getOperand(1) == PH) @@ -1944,7 +2012,7 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { // for ICMP_ULE here because the comparison would be with zero, which // isn't interesting. CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; - const SCEVNAryExpr *Max = 0; + const SCEVNAryExpr *Max = nullptr; if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) { Pred = ICmpInst::ICMP_SLE; Max = S; @@ -1987,7 +2055,7 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { // Check the right operand of the select, and remember it, as it will // be used in the new comparison instruction. - Value *NewRHS = 0; + Value *NewRHS = nullptr; if (ICmpInst::isTrueWhenEqual(Pred)) { // Look for n+1, and grab n. if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1))) @@ -2057,7 +2125,7 @@ LSRInstance::OptimizeLoopTermCond() { continue; // Search IVUsesByStride to find Cond's IVUse if there is one. - IVStrideUse *CondUse = 0; + IVStrideUse *CondUse = nullptr; ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition()); if (!FindIVUserForCond(Cond, CondUse)) continue; @@ -2110,12 +2178,12 @@ LSRInstance::OptimizeLoopTermCond() { // Check for possible scaled-address reuse. Type *AccessTy = getAccessType(UI->getUser()); int64_t Scale = C->getSExtValue(); - if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ 0, + if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ nullptr, /*BaseOffset=*/ 0, /*HasBaseReg=*/ false, Scale)) goto decline_post_inc; Scale = -Scale; - if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ 0, + if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ nullptr, /*BaseOffset=*/ 0, /*HasBaseReg=*/ false, Scale)) goto decline_post_inc; @@ -2185,23 +2253,25 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, // the uses will have all its uses outside the loop, for example. if (LU.Kind != Kind) return false; + + // Check for a mismatched access type, and fall back conservatively as needed. + // TODO: Be less conservative when the type is similar and can use the same + // addressing modes. + if (Kind == LSRUse::Address && AccessTy != LU.AccessTy) + NewAccessTy = Type::getVoidTy(AccessTy->getContext()); + // Conservatively assume HasBaseReg is true for now. if (NewOffset < LU.MinOffset) { - if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0, + if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr, LU.MaxOffset - NewOffset, HasBaseReg)) return false; NewMinOffset = NewOffset; } else if (NewOffset > LU.MaxOffset) { - if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0, + if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr, NewOffset - LU.MinOffset, HasBaseReg)) return false; NewMaxOffset = NewOffset; } - // Check for a mismatched access type, and fall back conservatively as needed. - // TODO: Be less conservative when the type is similar and can use the same - // addressing modes. - if (Kind == LSRUse::Address && AccessTy != LU.AccessTy) - NewAccessTy = Type::getVoidTy(AccessTy->getContext()); // Update the use. LU.MinOffset = NewMinOffset; @@ -2222,14 +2292,14 @@ LSRInstance::getUse(const SCEV *&Expr, int64_t Offset = ExtractImmediate(Expr, SE); // Basic uses can't accept any offset, for example. 
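
In source-level terms, OptimizeShadowIV (whose body this hunk touches) targets loops whose integer IV feeds an int-to-float cast, and introduces a parallel floating-point IV so the per-iteration cast disappears. A hypothetical before/after, consistent with the "double d = 0.0 in above example" comment in the code (the pass itself works on IR, not C++):

    void before(double *Out, int N) {
      for (int I = 0; I < N; ++I)
        Out[I] = (double)I;        // int->fp conversion every iteration
    }

    void after(double *Out, int N) {
      double D = 0.0;              // shadow FP induction variable
      for (int I = 0; I < N; ++I) {
        Out[I] = D;
        D += 1.0;                  // integer stride mirrored into FP
      }
    }
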
- if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0, + if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr, Offset, /*HasBaseReg=*/ true)) { Expr = Copy; Offset = 0; } std::pair<UseMapTy::iterator, bool> P = - UseMap.insert(std::make_pair(std::make_pair(Expr, Kind), 0)); + UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0)); if (!P.second) { // A use already existed with this base. size_t LUIdx = P.first->second; @@ -2306,7 +2376,7 @@ LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF, } // Nothing looked good. - return 0; + return nullptr; } void LSRInstance::CollectInterestingTypesAndFactors() { @@ -2338,7 +2408,7 @@ void LSRInstance::CollectInterestingTypesAndFactors() { for (SmallSetVector<const SCEV *, 4>::const_iterator I = Strides.begin(), E = Strides.end(); I != E; ++I) for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter = - llvm::next(I); NewStrideIter != E; ++NewStrideIter) { + std::next(I); NewStrideIter != E; ++NewStrideIter) { const SCEV *OldStride = *I; const SCEV *NewStride = *NewStrideIter; @@ -2424,7 +2494,7 @@ static const SCEV *getExprBase(const SCEV *S) { default: // uncluding scUnknown. return S; case scConstant: - return 0; + return nullptr; case scTruncate: return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand()); case scZeroExtend: @@ -2515,7 +2585,7 @@ isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users, && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) { --cost; } - const SCEV *LastIncExpr = 0; + const SCEV *LastIncExpr = nullptr; unsigned NumConstIncrements = 0; unsigned NumVarIncrements = 0; unsigned NumReusedIncrements = 0; @@ -2574,7 +2644,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, // Visit all existing chains. Check if its IVOper can be computed as a // profitable loop invariant increment from the last link in the Chain. unsigned ChainIdx = 0, NChains = IVChainVec.size(); - const SCEV *LastIncExpr = 0; + const SCEV *LastIncExpr = nullptr; for (; ChainIdx < NChains; ++ChainIdx) { IVChain &Chain = IVChainVec[ChainIdx]; @@ -2646,9 +2716,8 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, // they will eventually be used be the current chain, or can be computed // from one of the chain increments. To be more precise we could // transitively follow its user and only add leaf IV users to the set. - for (Value::use_iterator UseIter = IVOper->use_begin(), - UseEnd = IVOper->use_end(); UseIter != UseEnd; ++UseIter) { - Instruction *OtherUse = dyn_cast<Instruction>(*UseIter); + for (User *U : IVOper->users()) { + Instruction *OtherUse = dyn_cast<Instruction>(U); if (!OtherUse) continue; // Uses in the chain will no longer be uses if the chain is formed. @@ -2738,7 +2807,7 @@ void LSRInstance::CollectChains() { Instruction *IVOpInst = cast<Instruction>(*IVOpIter); if (UniqueOperands.insert(IVOpInst)) ChainInstruction(I, IVOpInst, ChainUsersVec); - IVOpIter = findIVOperand(llvm::next(IVOpIter), IVOpEnd, L, SE); + IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE); } } // Continue walking down the instructions. } // Continue walking down the domtree. 
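
CollectInterestingTypesAndFactors, part of this hunk, considers every pair of distinct strides and records the quotient whenever one stride exactly divides the other; those quotients later drive GenerateScales and GenerateICmpZeroScales. A constant-only sketch of the harvesting loop (the real code works on SCEVs via getExactSDiv):

    #include <cstdint>
    #include <set>
    #include <vector>

    void collectFactors(const std::vector<int64_t> &Strides,
                        std::set<int64_t> &Factors) {
      for (size_t I = 0; I != Strides.size(); ++I)
        for (size_t J = I + 1; J != Strides.size(); ++J) {
          int64_t A = Strides[I], B = Strides[J];
          // Record a factor only when the division is exact, the analogue
          // of the getExactSDiv calls in the real loop.
          if (A != 0 && B % A == 0)
            Factors.insert(B / A);
          if (B != 0 && A % B == 0)
            Factors.insert(A / B);
        }
    }
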
@@ -2795,7 +2864,7 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, int64_t IncOffset = IncConst->getValue()->getSExtValue(); if (!isAlwaysFoldable(TTI, LSRUse::Address, - getAccessType(UserInst), /*BaseGV=*/ 0, + getAccessType(UserInst), /*BaseGV=*/ nullptr, IncOffset, /*HaseBaseReg=*/ false)) return false; @@ -2813,7 +2882,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, // findIVOperand returns IVOpEnd if it can no longer find a valid IV user. User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(), IVOpEnd, L, SE); - Value *IVSrc = 0; + Value *IVSrc = nullptr; while (IVOpIter != IVOpEnd) { IVSrc = getWideOperand(*IVOpIter); @@ -2829,7 +2898,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, || SE.getSCEV(IVSrc) == Head.IncExpr) { break; } - IVOpIter = findIVOperand(llvm::next(IVOpIter), IVOpEnd, L, SE); + IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE); } if (IVOpIter == IVOpEnd) { // Gracefully give up on this chain. @@ -2840,7 +2909,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n"); Type *IVTy = IVSrc->getType(); Type *IntTy = SE.getEffectiveSCEVType(IVTy); - const SCEV *LeftOverExpr = 0; + const SCEV *LeftOverExpr = nullptr; for (IVChain::const_iterator IncI = Chain.begin(), IncE = Chain.end(); IncI != IncE; ++IncI) { @@ -2871,7 +2940,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, TTI)) { assert(IVTy == IVOper->getType() && "inconsistent IV increment type"); IVSrc = IVOper; - LeftOverExpr = 0; + LeftOverExpr = nullptr; } } Type *OperTy = IncI->IVOperand->getType(); @@ -2926,7 +2995,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { LF.PostIncLoops = UI->getPostIncLoops(); LSRUse::KindType Kind = LSRUse::Basic; - Type *AccessTy = 0; + Type *AccessTy = nullptr; if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) { Kind = LSRUse::Address; AccessTy = getAccessType(LF.UserInst); @@ -2957,7 +3026,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) { // S is normalized, so normalize N before folding it into S // to keep the result normalized. - N = TransformForPostIncUse(Normalize, N, CI, 0, + N = TransformForPostIncUse(Normalize, N, CI, nullptr, LF.PostIncLoops, SE, DT); Kind = LSRUse::ICmpZero; S = SE.getMinusSCEV(N, S); @@ -3032,6 +3101,9 @@ void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) { /// InsertFormula - If the given formula has not yet been inserted, add it to /// the list, and return true. Return false otherwise. bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) { + // Do not insert formula that we will not be able to expand. 
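
GenerateIVChain, updated above, emits a chain by threading one live pointer through successive small increments rather than recomputing every address from a common base. Roughly, in source terms (a hypothetical example, not the pass's actual output):

    // Unchained: three address computations all anchored at A.
    int sumUnchained(const int *A) { return A[0] + A[4] + A[8]; }

    // Chained: each access reuses the pointer produced by the previous
    // link, which is what the IVSrc/LeftOverExpr bookkeeping expands.
    int sumChained(const int *A) {
      const int *P = A;
      int S = *P;
      P += 4;  S += *P;
      P += 4;  S += *P;
      return S;
    }
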
+ assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) && + "Formula is illegal"); if (!LU.InsertFormula(F)) return false; @@ -3059,18 +3131,17 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) { Worklist.push_back(D->getLHS()); Worklist.push_back(D->getRHS()); - } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { - if (!Inserted.insert(U)) continue; - const Value *V = U->getValue(); + } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) { + if (!Inserted.insert(US)) continue; + const Value *V = US->getValue(); if (const Instruction *Inst = dyn_cast<Instruction>(V)) { // Look for instructions defined outside the loop. if (L->contains(Inst)) continue; } else if (isa<UndefValue>(V)) // Undef doesn't have a live range, so it doesn't matter. continue; - for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end(); - UI != UE; ++UI) { - const Instruction *UserInst = dyn_cast<Instruction>(*UI); + for (const Use &U : V->uses()) { + const Instruction *UserInst = dyn_cast<Instruction>(U.getUser()); // Ignore non-instructions. if (!UserInst) continue; @@ -3082,7 +3153,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { const BasicBlock *UseBB = !isa<PHINode>(UserInst) ? UserInst->getParent() : cast<PHINode>(UserInst)->getIncomingBlock( - PHINode::getIncomingValueNumForOperand(UI.getOperandNo())); + PHINode::getIncomingValueNumForOperand(U.getOperandNo())); if (!DT.dominates(L->getHeader(), UseBB)) continue; // Ignore uses which are part of other SCEV expressions, to avoid @@ -3092,7 +3163,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { // If the user is a no-op, look through to its uses. if (!isa<SCEVUnknown>(UserS)) continue; - if (UserS == U) { + if (UserS == US) { Worklist.push_back( SE.getUnknown(const_cast<Instruction *>(UserInst))); continue; @@ -3100,7 +3171,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { } // Ignore icmp instructions which are already being analyzed. if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) { - unsigned OtherIdx = !UI.getOperandNo(); + unsigned OtherIdx = !U.getOperandNo(); Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx)); if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L)) continue; @@ -3108,8 +3179,8 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { LSRFixup &LF = getNewFixup(); LF.UserInst = const_cast<Instruction *>(UserInst); - LF.OperandValToReplace = UI.getUse(); - std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, 0); + LF.OperandValToReplace = U; + std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, nullptr); LF.LUIdx = P.first; LF.Offset = P.second; LSRUse &LU = Uses[LF.LUIdx]; @@ -3118,7 +3189,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { SE.getTypeSizeInBits(LU.WidestFixupType) < SE.getTypeSizeInBits(LF.OperandValToReplace->getType())) LU.WidestFixupType = LF.OperandValToReplace->getType(); - InsertSupplementalFormula(U, LU, LF.LUIdx); + InsertSupplementalFormula(US, LU, LF.LUIdx); CountRegisters(LU.Formulae.back(), Uses.size() - 1); break; } @@ -3148,7 +3219,7 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, if (Remainder) Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); } - return 0; + return nullptr; } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { // Split a non-zero base out of an addrec. 
if (AR->getStart()->isZero()) @@ -3160,7 +3231,7 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, // does not pertain to this loop. if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) { Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); - Remainder = 0; + Remainder = nullptr; } if (Remainder != AR->getStart()) { if (!Remainder) @@ -3182,90 +3253,110 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1); if (Remainder) Ops.push_back(SE.getMulExpr(C, Remainder)); - return 0; + return nullptr; } } return S; } -/// GenerateReassociations - Split out subexpressions from adds and the bases of -/// addrecs. -void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, - Formula Base, - unsigned Depth) { - // Arbitrarily cap recursion to protect compile time. - if (Depth >= 3) return; +/// \brief Helper function for LSRInstance::GenerateReassociations. +void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, + const Formula &Base, + unsigned Depth, size_t Idx, + bool IsScaledReg) { + const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx]; + SmallVector<const SCEV *, 8> AddOps; + const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE); + if (Remainder) + AddOps.push_back(Remainder); + + if (AddOps.size() == 1) + return; - for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { - const SCEV *BaseReg = Base.BaseRegs[i]; + for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(), + JE = AddOps.end(); + J != JE; ++J) { - SmallVector<const SCEV *, 8> AddOps; - const SCEV *Remainder = CollectSubexprs(BaseReg, 0, AddOps, L, SE); - if (Remainder) - AddOps.push_back(Remainder); + // Loop-variant "unknown" values are uninteresting; we won't be able to + // do anything meaningful with them. + if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L)) + continue; - if (AddOps.size() == 1) continue; + // Don't pull a constant into a register if the constant could be folded + // into an immediate field. + if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, *J, Base.getNumRegs() > 1)) + continue; - for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(), - JE = AddOps.end(); J != JE; ++J) { + // Collect all operands except *J. + SmallVector<const SCEV *, 8> InnerAddOps( + ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J); + InnerAddOps.append(std::next(J), + ((const SmallVector<const SCEV *, 8> &)AddOps).end()); + + // Don't leave just a constant behind in a register if the constant could + // be folded into an immediate field. + if (InnerAddOps.size() == 1 && + isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1)) + continue; - // Loop-variant "unknown" values are uninteresting; we won't be able to - // do anything meaningful with them. - if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L)) - continue; + const SCEV *InnerSum = SE.getAddExpr(InnerAddOps); + if (InnerSum->isZero()) + continue; + Formula F = Base; - // Don't pull a constant into a register if the constant could be folded - // into an immediate field. - if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, - LU.AccessTy, *J, Base.getNumRegs() > 1)) - continue; + // Add the remaining pieces of the add back into the new formula. 
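
The rewritten reassociation logic enumerates ways to split a register that holds a sum, so each addend can be priced as its own register or immediate. In source terms, these are the kinds of alternative formulae it produces for one address (hypothetical example):

    // One register holds the whole sum: reg = I + J + 4.
    int loadA(const int *A, long I, long J) { return A[I + J + 4]; }

    // Split candidate: the constant becomes an immediate and the variant
    // parts become separate registers, e.g. reg1 = I, reg2 = J, offset 4.
    int loadB(const int *A, long I, long J) {
      const int *Base = A + I;   // reg1
      return Base[J + 4];        // reg2 plus a folded immediate
    }
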
+ const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum); + if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 && + TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + InnerSumSC->getValue()->getZExtValue())) { + F.UnfoldedOffset = + (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue(); + if (IsScaledReg) + F.ScaledReg = nullptr; + else + F.BaseRegs.erase(F.BaseRegs.begin() + Idx); + } else if (IsScaledReg) + F.ScaledReg = InnerSum; + else + F.BaseRegs[Idx] = InnerSum; + + // Add J as its own register, or an unfolded immediate. + const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J); + if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 && + TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + SC->getValue()->getZExtValue())) + F.UnfoldedOffset = + (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue(); + else + F.BaseRegs.push_back(*J); + // We may have changed the number of register in base regs, adjust the + // formula accordingly. + F.Canonicalize(); - // Collect all operands except *J. - SmallVector<const SCEV *, 8> InnerAddOps - (((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J); - InnerAddOps.append - (llvm::next(J), ((const SmallVector<const SCEV *, 8> &)AddOps).end()); - - // Don't leave just a constant behind in a register if the constant could - // be folded into an immediate field. - if (InnerAddOps.size() == 1 && - isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, - LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1)) - continue; + if (InsertFormula(LU, LUIdx, F)) + // If that formula hadn't been seen before, recurse to find more like + // it. + GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth + 1); + } +} - const SCEV *InnerSum = SE.getAddExpr(InnerAddOps); - if (InnerSum->isZero()) - continue; - Formula F = Base; +/// GenerateReassociations - Split out subexpressions from adds and the bases of +/// addrecs. +void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, + Formula Base, unsigned Depth) { + assert(Base.isCanonical() && "Input must be in the canonical form"); + // Arbitrarily cap recursion to protect compile time. + if (Depth >= 3) + return; - // Add the remaining pieces of the add back into the new formula. - const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum); - if (InnerSumSC && - SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 && - TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + - InnerSumSC->getValue()->getZExtValue())) { - F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + - InnerSumSC->getValue()->getZExtValue(); - F.BaseRegs.erase(F.BaseRegs.begin() + i); - } else - F.BaseRegs[i] = InnerSum; - - // Add J as its own register, or an unfolded immediate. - const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J); - if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 && - TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + - SC->getValue()->getZExtValue())) - F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + - SC->getValue()->getZExtValue(); - else - F.BaseRegs.push_back(*J); + for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) + GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i); - if (InsertFormula(LU, LUIdx, F)) - // If that formula hadn't been seen before, recurse to find more like - // it. 
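
Both constant paths above prefer folding a small addend into the formula's UnfoldedOffset whenever TTI.isLegalAddImmediate accepts the combined value; only otherwise does the constant cost a register. A toy of that decision (isLegalAddImm is a stand-in predicate, not the TTI hook):

    #include <cstdint>
    #include <vector>

    void absorbAddend(int64_t Addend, int64_t &UnfoldedOffset,
                      std::vector<int64_t> &BaseRegs,
                      bool (*isLegalAddImm)(int64_t)) {
      if (isLegalAddImm(UnfoldedOffset + Addend))
        UnfoldedOffset += Addend;      // one unfolded add, no register
      else
        BaseRegs.push_back(Addend);    // falls back to its own register
    }
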
- GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth+1); - } - } + if (Base.Scale == 1) + GenerateReassociationsImpl(LU, LUIdx, Base, Depth, + /* Idx */ -1, /* IsScaledReg */ true); } /// GenerateCombinations - Generate a formula consisting of all of the @@ -3273,8 +3364,12 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base) { // This method is only interesting on a plurality of registers. - if (Base.BaseRegs.size() <= 1) return; + if (Base.BaseRegs.size() + (Base.Scale == 1) <= 1) + return; + // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before + // processing the formula. + Base.Unscale(); Formula F = Base; F.BaseRegs.clear(); SmallVector<const SCEV *, 4> Ops; @@ -3294,29 +3389,87 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, // rather than proceed with zero in a register. if (!Sum->isZero()) { F.BaseRegs.push_back(Sum); + F.Canonicalize(); (void)InsertFormula(LU, LUIdx, F); } } } +/// \brief Helper function for LSRInstance::GenerateSymbolicOffsets. +void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx, + const Formula &Base, size_t Idx, + bool IsScaledReg) { + const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx]; + GlobalValue *GV = ExtractSymbol(G, SE); + if (G->isZero() || !GV) + return; + Formula F = Base; + F.BaseGV = GV; + if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) + return; + if (IsScaledReg) + F.ScaledReg = G; + else + F.BaseRegs[Idx] = G; + (void)InsertFormula(LU, LUIdx, F); +} + /// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets. void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base) { // We can't add a symbolic offset if the address already contains one. if (Base.BaseGV) return; - for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { - const SCEV *G = Base.BaseRegs[i]; - GlobalValue *GV = ExtractSymbol(G, SE); - if (G->isZero() || !GV) - continue; + for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) + GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i); + if (Base.Scale == 1) + GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1, + /* IsScaledReg */ true); +} + +/// \brief Helper function for LSRInstance::GenerateConstantOffsets. +void LSRInstance::GenerateConstantOffsetsImpl( + LSRUse &LU, unsigned LUIdx, const Formula &Base, + const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) { + const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx]; + for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(), + E = Worklist.end(); + I != E; ++I) { Formula F = Base; - F.BaseGV = GV; - if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) - continue; - F.BaseRegs[i] = G; - (void)InsertFormula(LU, LUIdx, F); + F.BaseOffset = (uint64_t)Base.BaseOffset - *I; + if (isLegalUse(TTI, LU.MinOffset - *I, LU.MaxOffset - *I, LU.Kind, + LU.AccessTy, F)) { + // Add the offset to the base register. + const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G); + // If it cancelled out, drop the base register, otherwise update it. 
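
Unscale's body lies outside this diff, but the call sites pin down its behavior: it flattens reg1 + 1*reg2 into reg1 + reg2 and, judging by the "!Base.Unscale()" use in GenerateScales further down, reports failure for any other scale, since 2*reg is not a plain sum of registers. A toy consistent with those call sites (an assumption, not LLVM's implementation):

    #include <vector>

    struct ToyFormula {
      std::vector<unsigned> BaseRegs;  // toy register ids
      unsigned ScaledReg = 0;          // 0 = none, toy convention
      int Scale = 0;
      bool unscale() {
        if (Scale != 1)
          return false;                // only a by-1 scale can be flattened
        BaseRegs.push_back(ScaledReg); // reg1 + 1*reg2 => reg1 + reg2
        ScaledReg = 0;
        Scale = 0;
        return true;
      }
    };
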
+ if (NewG->isZero()) { + if (IsScaledReg) { + F.Scale = 0; + F.ScaledReg = nullptr; + } else + F.DeleteBaseReg(F.BaseRegs[Idx]); + F.Canonicalize(); + } else if (IsScaledReg) + F.ScaledReg = NewG; + else + F.BaseRegs[Idx] = NewG; + + (void)InsertFormula(LU, LUIdx, F); + } } + + int64_t Imm = ExtractImmediate(G, SE); + if (G->isZero() || Imm == 0) + return; + Formula F = Base; + F.BaseOffset = (uint64_t)F.BaseOffset + Imm; + if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) + return; + if (IsScaledReg) + F.ScaledReg = G; + else + F.BaseRegs[Idx] = G; + (void)InsertFormula(LU, LUIdx, F); } /// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets. @@ -3329,38 +3482,11 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, if (LU.MaxOffset != LU.MinOffset) Worklist.push_back(LU.MaxOffset); - for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { - const SCEV *G = Base.BaseRegs[i]; - - for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(), - E = Worklist.end(); I != E; ++I) { - Formula F = Base; - F.BaseOffset = (uint64_t)Base.BaseOffset - *I; - if (isLegalUse(TTI, LU.MinOffset - *I, LU.MaxOffset - *I, LU.Kind, - LU.AccessTy, F)) { - // Add the offset to the base register. - const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G); - // If it cancelled out, drop the base register, otherwise update it. - if (NewG->isZero()) { - std::swap(F.BaseRegs[i], F.BaseRegs.back()); - F.BaseRegs.pop_back(); - } else - F.BaseRegs[i] = NewG; - - (void)InsertFormula(LU, LUIdx, F); - } - } - - int64_t Imm = ExtractImmediate(G, SE); - if (G->isZero() || Imm == 0) - continue; - Formula F = Base; - F.BaseOffset = (uint64_t)F.BaseOffset + Imm; - if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) - continue; - F.BaseRegs[i] = G; - (void)InsertFormula(LU, LUIdx, F); - } + for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) + GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i); + if (Base.Scale == 1) + GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1, + /* IsScaledReg */ true); } /// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up @@ -3460,7 +3586,11 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { if (!IntTy) return; // If this Formula already has a scaled register, we can't add another one. - if (Base.Scale != 0) return; + // Try to unscale the formula to generate a better scale. + if (Base.Scale != 0 && !Base.Unscale()) + return; + + assert(Base.Scale == 0 && "Unscale did not did its job!"); // Check each interesting stride. for (SmallSetVector<int64_t, 8>::const_iterator @@ -3501,6 +3631,11 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { Formula F = Base; F.ScaledReg = Quotient; F.DeleteBaseReg(F.BaseRegs[i]); + // The canonical representation of 1*reg is reg, which is already in + // Base. In that case, do not try to insert the formula, it will be + // rejected anyway. + if (F.Scale == 1 && F.BaseRegs.empty()) + continue; (void)InsertFormula(LU, LUIdx, F); } } @@ -3626,8 +3761,9 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // Conservatively examine offsets between this orig reg a few selected // other orig regs. 
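
GenerateConstantOffsetsImpl trades an immediate back and forth between the register expression and the formula's BaseOffset: the same address can be spelled with the constant inside the register or inside the addressing mode, and different targets fold the two spellings differently. In source terms (hypothetical example):

    // Two spellings of the same load; LSR enumerates both as formulae.
    int loadA(const int *A, long I) { return A[I + 7]; }    // offset in reg
    int loadB(const int *A, long I) { return (A + 7)[I]; }  // offset in AM
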
ImmMapTy::const_iterator OtherImms[] = { - Imms.begin(), prior(Imms.end()), - Imms.lower_bound((Imms.begin()->first + prior(Imms.end())->first) / 2) + Imms.begin(), std::prev(Imms.end()), + Imms.lower_bound((Imms.begin()->first + std::prev(Imms.end())->first) / + 2) }; for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) { ImmMapTy::const_iterator M = OtherImms[i]; @@ -3664,7 +3800,12 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // TODO: Use a more targeted data structure. for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) { - const Formula &F = LU.Formulae[L]; + Formula F = LU.Formulae[L]; + // FIXME: The code for the scaled and unscaled registers looks + // very similar but slightly different. Investigate if they + // could be merged. That way, we would not have to unscale the + // Formula. + F.Unscale(); // Use the immediate in the scaled register. if (F.ScaledReg == OrigReg) { int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale; @@ -3690,6 +3831,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { continue; // OK, looks good. + NewF.Canonicalize(); (void)InsertFormula(LU, LUIdx, NewF); } else { // Use the immediate in a base register. @@ -3723,6 +3865,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { goto skip_formula; // Ok, looks good. + NewF.Canonicalize(); (void)InsertFormula(LU, LUIdx, NewF); break; skip_formula:; @@ -3976,7 +4119,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), E = LU.Formulae.end(); I != E; ++I) { const Formula &F = *I; - if (F.BaseOffset == 0 || F.Scale != 0) + if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1)) continue; LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU); @@ -4073,7 +4216,7 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() { // Pick the register which is used by the most LSRUses, which is likely // to be a good reuse register candidate. - const SCEV *Best = 0; + const SCEV *Best = nullptr; unsigned BestNum = 0; for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end(); I != E; ++I) { @@ -4170,19 +4313,22 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, E = LU.Formulae.end(); I != E; ++I) { const Formula &F = *I; - // Ignore formulae which do not use any of the required registers. - bool SatisfiedReqReg = true; + // Ignore formulae which may not be ideal in terms of register reuse of + // ReqRegs. The formula should use all required registers before + // introducing new ones. + int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size()); for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(), JE = ReqRegs.end(); J != JE; ++J) { const SCEV *Reg = *J; - if ((!F.ScaledReg || F.ScaledReg != Reg) && - std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) == + if ((F.ScaledReg && F.ScaledReg == Reg) || + std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) != F.BaseRegs.end()) { - SatisfiedReqReg = false; - break; + --NumReqRegsToFind; + if (NumReqRegsToFind == 0) + break; } } - if (!SatisfiedReqReg) { + if (NumReqRegsToFind != 0) { // If none of the formulae satisfied the required registers, then we could // clear ReqRegs and try again. Currently, we simply give up in this case. 
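
The SolveRecurse change at the end of this hunk replaces the all-or-nothing required-register test: a formula now qualifies once it reuses min(F.getNumRegs(), ReqRegs.size()) of the registers the partial solution already pays for. The filter, extracted into a standalone toy:

    #include <algorithm>
    #include <set>
    #include <vector>

    bool reusesRequiredRegs(const std::vector<unsigned> &FormulaRegs,
                            const std::set<unsigned> &ReqRegs) {
      size_t ToFind = std::min(FormulaRegs.size(), ReqRegs.size());
      for (unsigned Reg : ReqRegs)
        if (std::find(FormulaRegs.begin(), FormulaRegs.end(), Reg) !=
            FormulaRegs.end())
          if (--ToFind == 0)
            break;
      return ToFind == 0;  // trivially true when either set is empty
    }
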
continue; @@ -4222,7 +4368,7 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const { SmallVector<const Formula *, 8> Workspace; Cost SolutionCost; - SolutionCost.Loose(); + SolutionCost.Lose(); Cost CurCost; SmallPtrSet<const SCEV *, 16> CurRegs; DenseSet<const SCEV *> VisitedRegs; @@ -4280,7 +4426,7 @@ LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, } bool AllDominate = true; - Instruction *BetterPos = 0; + Instruction *BetterPos = nullptr; Instruction *Tentative = IDom->getTerminator(); for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(), E = Inputs.end(); I != E; ++I) { @@ -4293,7 +4439,7 @@ LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, // instead of at the end, so that it can be used for other expansions. if (IDom == Inst->getParent() && (!BetterPos || !DT.dominates(Inst, BetterPos))) - BetterPos = llvm::next(BasicBlock::iterator(Inst)); + BetterPos = std::next(BasicBlock::iterator(Inst)); } if (!AllDominate) break; @@ -4419,11 +4565,11 @@ Value *LSRInstance::Expand(const LSRFixup &LF, LF.UserInst, LF.OperandValToReplace, Loops, SE, DT); - Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP))); + Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr, IP))); } // Expand the ScaledReg portion. - Value *ICmpScaledV = 0; + Value *ICmpScaledV = nullptr; if (F.Scale != 0) { const SCEV *ScaledS = F.ScaledReg; @@ -4434,25 +4580,34 @@ Value *LSRInstance::Expand(const LSRFixup &LF, Loops, SE, DT); if (LU.Kind == LSRUse::ICmpZero) { - // An interesting way of "folding" with an icmp is to use a negated - // scale, which we'll implement by inserting it into the other operand - // of the icmp. - assert(F.Scale == -1 && - "The only scale supported by ICmpZero uses is -1!"); - ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP); + // Expand ScaleReg as if it was part of the base regs. + if (F.Scale == 1) + Ops.push_back( + SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, IP))); + else { + // An interesting way of "folding" with an icmp is to use a negated + // scale, which we'll implement by inserting it into the other operand + // of the icmp. + assert(F.Scale == -1 && + "The only scale supported by ICmpZero uses is -1!"); + ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, IP); + } } else { // Otherwise just expand the scaled register and an explicit scale, // which is expected to be matched as part of the address. // Flush the operand list to suppress SCEVExpander hoisting address modes. - if (!Ops.empty() && LU.Kind == LSRUse::Address) { + // Unless the addressing mode will not be folded. + if (!Ops.empty() && LU.Kind == LSRUse::Address && + isAMCompletelyFolded(TTI, LU, F)) { Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } - ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP)); - ScaledS = SE.getMulExpr(ScaledS, - SE.getConstant(ScaledS->getType(), F.Scale)); + ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, IP)); + if (F.Scale != 1) + ScaledS = + SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale)); Ops.push_back(ScaledS); } } @@ -4530,7 +4685,9 @@ Value *LSRInstance::Expand(const LSRFixup &LF, } CI->setOperand(1, ICmpScaledV); } else { - assert(F.Scale == 0 && + // A scale of 1 means that the scale has been expanded as part of the + // base regs. 
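
The ICmpZero handling above leans on simple algebra: a formula BaseReg + -1*ScaledReg tested against zero is just an equality test between the two registers, so that particular scale is free. In source terms:

    #include <cstdint>

    bool viaFormula(int64_t BaseReg, int64_t ScaledReg) {
      // Unsigned subtraction keeps the sketch free of signed-overflow UB;
      // the zero test is unaffected.
      return (uint64_t)BaseReg - (uint64_t)ScaledReg == 0;
    }

    bool viaICmp(int64_t BaseReg, int64_t ScaledReg) {
      return BaseReg == ScaledReg;  // what the pass lets ISel emit instead
    }
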
+ assert((F.Scale == 0 || F.Scale == 1) && "ICmp does not support folding a global value and " "a scale at the same time!"); Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy), @@ -4571,7 +4728,7 @@ void LSRInstance::RewriteForPHI(PHINode *PN, Loop *PNLoop = LI.getLoopFor(Parent); if (!PNLoop || Parent != PNLoop->getHeader()) { // Split the critical edge. - BasicBlock *NewBB = 0; + BasicBlock *NewBB = nullptr; if (!Parent->isLandingPad()) { NewBB = SplitCriticalEdge(BB, Parent, P, /*MergeIdenticalEdges=*/true, @@ -4600,7 +4757,7 @@ void LSRInstance::RewriteForPHI(PHINode *PN, } std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair = - Inserted.insert(std::make_pair(BB, static_cast<Value *>(0))); + Inserted.insert(std::make_pair(BB, static_cast<Value *>(nullptr))); if (!Pair.second) PN->setIncomingValue(i, Pair.first->second); else { @@ -4707,9 +4864,10 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, LSRInstance::LSRInstance(Loop *L, Pass *P) : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()), - DT(P->getAnalysis<DominatorTree>()), LI(P->getAnalysis<LoopInfo>()), + DT(P->getAnalysis<DominatorTreeWrapperPass>().getDomTree()), + LI(P->getAnalysis<LoopInfo>()), TTI(P->getAnalysis<TargetTransformInfo>()), L(L), Changed(false), - IVIncInsertPos(0) { + IVIncInsertPos(nullptr) { // If LoopSimplify form is not available, stay out of trouble. if (!L->isLoopSimplifyForm()) return; @@ -4746,7 +4904,7 @@ LSRInstance::LSRInstance(Loop *L, Pass *P) #endif // DEBUG DEBUG(dbgs() << "\nLSR on loop "; - WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false); + L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false); dbgs() << ":\n"); // First, perform some low-level loop optimizations. @@ -4876,8 +5034,8 @@ public: LoopStrengthReduce(); private: - bool runOnLoop(Loop *L, LPPassManager &LPM); - void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnLoop(Loop *L, LPPassManager &LPM) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; }; } @@ -4886,7 +5044,7 @@ char LoopStrengthReduce::ID = 0; INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce", "Loop Strength Reduction", false, false) INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(IVUsers) INITIALIZE_PASS_DEPENDENCY(LoopInfo) @@ -4911,8 +5069,8 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<LoopInfo>(); AU.addPreserved<LoopInfo>(); AU.addRequiredID(LoopSimplifyID); - AU.addRequired<DominatorTree>(); - AU.addPreserved<DominatorTree>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<ScalarEvolution>(); AU.addPreserved<ScalarEvolution>(); // Requiring LoopSimplify a second time here prevents IVUsers from running @@ -4924,6 +5082,9 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { } bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { + if (skipOptnoneFunction(L)) + return false; + bool Changed = false; // Run the main LSR transformation. 
@@ -4937,10 +5098,9 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { #ifndef NDEBUG Rewriter.setDebugType(DEBUG_TYPE); #endif - unsigned numFolded = - Rewriter.replaceCongruentIVs(L, &getAnalysis<DominatorTree>(), - DeadInsts, - &getAnalysis<TargetTransformInfo>()); + unsigned numFolded = Rewriter.replaceCongruentIVs( + L, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), DeadInsts, + &getAnalysis<TargetTransformInfo>()); if (numFolded) { Changed = true; DeleteTriviallyDeadInstructions(DeadInsts); diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 08ac38d..935f289 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -12,14 +12,16 @@ // counts of loops easily. //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-unroll" #include "llvm/Transforms/Scalar.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Metadata.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -28,13 +30,16 @@ using namespace llvm; +#define DEBUG_TYPE "loop-unroll" + static cl::opt<unsigned> UnrollThreshold("unroll-threshold", cl::init(150), cl::Hidden, cl::desc("The cut-off point for automatic loop unrolling")); static cl::opt<unsigned> UnrollCount("unroll-count", cl::init(0), cl::Hidden, - cl::desc("Use this unroll count for all loops, for testing purposes")); + cl::desc("Use this unroll count for all loops including those with " + "unroll_count pragma values, for testing purposes")); static cl::opt<bool> UnrollAllowPartial("unroll-allow-partial", cl::init(false), cl::Hidden, @@ -45,6 +50,11 @@ static cl::opt<bool> UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::init(false), cl::Hidden, cl::desc("Unroll loops with run-time trip counts")); +static cl::opt<unsigned> +PragmaUnrollThreshold("pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden, + cl::desc("Unrolled size limit for loops with an unroll(enable) or " + "unroll_count pragma.")); + namespace { class LoopUnroll : public LoopPass { public: @@ -86,12 +96,12 @@ namespace { bool UserAllowPartial; // CurrentAllowPartial is user-specified. bool UserRuntime; // CurrentRuntime is user-specified. - bool runOnLoop(Loop *L, LPPassManager &LPM); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; /// This transformation requires natural loop information & requires that /// loop preheaders be inserted into the CFG... /// - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LoopInfo>(); AU.addPreserved<LoopInfo>(); AU.addRequiredID(LoopSimplifyID); @@ -105,7 +115,67 @@ namespace { // If loop unroll does not preserve dom info then LCSSA pass on next // loop will receive invalid dom info. // For now, recreate dom info, if loop is unrolled. - AU.addPreserved<DominatorTree>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + } + + // Fill in the UnrollingPreferences parameter with values from the + // TargetTransformationInfo. 
+ void getUnrollingPreferences(Loop *L, const TargetTransformInfo &TTI, + TargetTransformInfo::UnrollingPreferences &UP) { + UP.Threshold = CurrentThreshold; + UP.OptSizeThreshold = OptSizeUnrollThreshold; + UP.PartialThreshold = CurrentThreshold; + UP.PartialOptSizeThreshold = OptSizeUnrollThreshold; + UP.Count = CurrentCount; + UP.MaxCount = UINT_MAX; + UP.Partial = CurrentAllowPartial; + UP.Runtime = CurrentRuntime; + TTI.getUnrollingPreferences(L, UP); + } + + // Select and return an unroll count based on parameters from + // user, unroll preferences, unroll pragmas, or a heuristic. + // SetExplicitly is set to true if the unroll count is is set by + // the user or a pragma rather than selected heuristically. + unsigned + selectUnrollCount(const Loop *L, unsigned TripCount, bool HasEnablePragma, + unsigned PragmaCount, + const TargetTransformInfo::UnrollingPreferences &UP, + bool &SetExplicitly); + + + // Select threshold values used to limit unrolling based on a + // total unrolled size. Parameters Threshold and PartialThreshold + // are set to the maximum unrolled size for fully and partially + // unrolled loops respectively. + void selectThresholds(const Loop *L, bool HasPragma, + const TargetTransformInfo::UnrollingPreferences &UP, + unsigned &Threshold, unsigned &PartialThreshold) { + // Determine the current unrolling threshold. While this is + // normally set from UnrollThreshold, it is overridden to a + // smaller value if the current function is marked as + // optimize-for-size, and the unroll threshold was not user + // specified. + Threshold = UserThreshold ? CurrentThreshold : UP.Threshold; + PartialThreshold = UserThreshold ? CurrentThreshold : UP.PartialThreshold; + if (!UserThreshold && + L->getHeader()->getParent()->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, + Attribute::OptimizeForSize)) { + Threshold = UP.OptSizeThreshold; + PartialThreshold = UP.PartialOptSizeThreshold; + } + if (HasPragma) { + // If the loop has an unrolling pragma, we want to be more + // aggressive with unrolling limits. Set thresholds to at + // least the PragmaTheshold value which is larger than the + // default limits. + if (Threshold != NoThreshold) + Threshold = std::max<unsigned>(Threshold, PragmaUnrollThreshold); + if (PartialThreshold != NoThreshold) + PartialThreshold = + std::max<unsigned>(PartialThreshold, PragmaUnrollThreshold); + } } }; } @@ -124,6 +194,10 @@ Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial, return new LoopUnroll(Threshold, Count, AllowPartial, Runtime); } +Pass *llvm::createSimpleLoopUnrollPass() { + return llvm::createLoopUnrollPass(-1, -1, 0, 0); +} + /// ApproximateLoopSize - Approximate the size of the loop. static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, @@ -145,7 +219,144 @@ static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, return LoopSize; } +// Returns the value associated with the given metadata node name (for +// example, "llvm.loop.unroll.count"). If no such named metadata node +// exists, then nullptr is returned. +static const ConstantInt *GetUnrollMetadataValue(const Loop *L, + StringRef Name) { + MDNode *LoopID = L->getLoopID(); + if (!LoopID) return nullptr; + + // First operand should refer to the loop id itself. 
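
selectThresholds, defined above, layers three sources of limits: an explicit -unroll-threshold wins outright, optimize-for-size functions otherwise get the smaller OptSize limits, and an unroll pragma then raises whatever survived to at least PragmaUnrollThreshold. The same precedence in isolation (the NoThreshold sentinel handling is omitted for brevity):

    #include <algorithm>

    unsigned pickThreshold(bool UserSet, unsigned UserVal, unsigned Default,
                           bool OptForSize, unsigned OptSizeVal,
                           bool HasPragma, unsigned PragmaMin) {
      unsigned Threshold = UserSet ? UserVal : Default;
      if (!UserSet && OptForSize)
        Threshold = OptSizeVal;                      // shrink for -Os bodies
      if (HasPragma)
        Threshold = std::max(Threshold, PragmaMin);  // pragmas unroll harder
      return Threshold;
    }
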
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); + assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); + + for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) { + const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); + if (!MD) continue; + + const MDString *S = dyn_cast<MDString>(MD->getOperand(0)); + if (!S) continue; + + if (Name.equals(S->getString())) { + assert(MD->getNumOperands() == 2 && + "Unroll hint metadata should have two operands."); + return cast<ConstantInt>(MD->getOperand(1)); + } + } + return nullptr; +} + +// Returns true if the loop has an unroll(enable) pragma. +static bool HasUnrollEnablePragma(const Loop *L) { + const ConstantInt *EnableValue = + GetUnrollMetadataValue(L, "llvm.loop.unroll.enable"); + return (EnableValue && EnableValue->getZExtValue()); +} + +// Returns true if the loop has an unroll(disable) pragma. +static bool HasUnrollDisablePragma(const Loop *L) { + const ConstantInt *EnableValue = + GetUnrollMetadataValue(L, "llvm.loop.unroll.enable"); + return (EnableValue && !EnableValue->getZExtValue()); +} + +// If loop has an unroll_count pragma return the (necessarily +// positive) value from the pragma. Otherwise return 0. +static unsigned UnrollCountPragmaValue(const Loop *L) { + const ConstantInt *CountValue = + GetUnrollMetadataValue(L, "llvm.loop.unroll.count"); + if (CountValue) { + unsigned Count = CountValue->getZExtValue(); + assert(Count >= 1 && "Unroll count must be positive."); + return Count; + } + return 0; +} + +// Remove existing unroll metadata and add unroll disable metadata to +// indicate the loop has already been unrolled. This prevents a loop +// from being unrolled more than is directed by a pragma if the loop +// unrolling pass is run more than once (which it generally is). +static void SetLoopAlreadyUnrolled(Loop *L) { + MDNode *LoopID = L->getLoopID(); + if (!LoopID) return; + + // First remove any existing loop unrolling metadata. + SmallVector<Value *, 4> Vals; + // Reserve first location for self reference to the LoopID metadata node. + Vals.push_back(nullptr); + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + bool IsUnrollMetadata = false; + MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); + if (MD) { + const MDString *S = dyn_cast<MDString>(MD->getOperand(0)); + IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll."); + } + if (!IsUnrollMetadata) Vals.push_back(LoopID->getOperand(i)); + } + + // Add unroll(disable) metadata to disable future unrolling. + LLVMContext &Context = L->getHeader()->getContext(); + SmallVector<Value *, 2> DisableOperands; + DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.enable")); + DisableOperands.push_back(ConstantInt::get(Type::getInt1Ty(Context), 0)); + MDNode *DisableNode = MDNode::get(Context, DisableOperands); + Vals.push_back(DisableNode); + + MDNode *NewLoopID = MDNode::get(Context, Vals); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + L->setLoopID(NewLoopID); + LoopID->replaceAllUsesWith(NewLoopID); +} + +unsigned LoopUnroll::selectUnrollCount( + const Loop *L, unsigned TripCount, bool HasEnablePragma, + unsigned PragmaCount, const TargetTransformInfo::UnrollingPreferences &UP, + bool &SetExplicitly) { + SetExplicitly = true; + + // User-specified count (either as a command-line option or + // constructor parameter) has highest precedence. + unsigned Count = UserCount ? 
CurrentCount : 0; + + // If there is no user-specified count, unroll pragmas have the next + // highest precedence. + if (Count == 0) { + if (PragmaCount) { + Count = PragmaCount; + } else if (HasEnablePragma) { + // unroll(enable) pragma without an unroll_count pragma + // indicates that the loop should be unrolled fully. + Count = TripCount; + } + } + + if (Count == 0) + Count = UP.Count; + + if (Count == 0) { + SetExplicitly = false; + if (TripCount == 0) + // Runtime trip count. + Count = UnrollRuntimeCount; + else + // Conservative heuristic: if we know the trip count, see if we can + // completely unroll (subject to the threshold, checked below); otherwise + // try to find the greatest factor of the trip count whose unrolled size + // is still under the threshold value. + Count = TripCount; + } + if (TripCount && Count > TripCount) + return TripCount; + return Count; +} + bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { + if (skipOptnoneFunction(L)) + return false; + LoopInfo *LI = &getAnalysis<LoopInfo>(); ScalarEvolution *SE = &getAnalysis<ScalarEvolution>(); const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>(); @@ -153,26 +364,16 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName() << "] Loop %" << Header->getName() << "\n"); - (void)Header; + + if (HasUnrollDisablePragma(L)) { + return false; + } + bool HasEnablePragma = HasUnrollEnablePragma(L); + unsigned PragmaCount = UnrollCountPragmaValue(L); + bool HasPragma = HasEnablePragma || PragmaCount > 0; TargetTransformInfo::UnrollingPreferences UP; - UP.Threshold = CurrentThreshold; - UP.OptSizeThreshold = OptSizeUnrollThreshold; - UP.Count = CurrentCount; - UP.Partial = CurrentAllowPartial; - UP.Runtime = CurrentRuntime; - TTI.getUnrollingPreferences(L, UP); - - // Determine the current unrolling threshold. While this is normally set - // from UnrollThreshold, it is overridden to a smaller value if the current - // function is marked as optimize-for-size, and the unroll threshold was - // not user specified. - unsigned Threshold = UserThreshold ? CurrentThreshold : UP.Threshold; - if (!UserThreshold && - Header->getParent()->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, - Attribute::OptimizeForSize)) - Threshold = UP.OptSizeThreshold; + getUnrollingPreferences(L, TTI, UP); // Find trip count and trip multiple if count is not available unsigned TripCount = 0; @@ -186,74 +387,121 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock); } - bool Runtime = UserRuntime ? CurrentRuntime : UP.Runtime; + // Select an initial unroll count. This may be reduced later based + // on size thresholds. + bool CountSetExplicitly; + unsigned Count = selectUnrollCount(L, TripCount, HasEnablePragma, PragmaCount, + UP, CountSetExplicitly); + + unsigned NumInlineCandidates; + bool notDuplicatable; + unsigned LoopSize = + ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI); + DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); + uint64_t UnrolledSize = (uint64_t)LoopSize * Count; + if (notDuplicatable) { + DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable" + << " instructions.\n"); + return false; + } + if (NumInlineCandidates != 0) { + DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); + return false; + } - // Use a default unroll-count if the user doesn't specify a value - // and the trip count is a run-time value.
The default is different - // for run-time or compile-time trip count loops. - unsigned Count = UserCount ? CurrentCount : UP.Count; - if (Runtime && Count == 0 && TripCount == 0) - Count = UnrollRuntimeCount; + unsigned Threshold, PartialThreshold; + selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold); - if (Count == 0) { - // Conservative heuristic: if we know the trip count, see if we can - // completely unroll (subject to the threshold, checked below); otherwise - // try to find greatest modulo of the trip count which is still under - // threshold value. - if (TripCount == 0) - return false; - Count = TripCount; + // Given Count, TripCount, and the thresholds, determine the type of + // unrolling to be performed. + enum { Full = 0, Partial = 1, Runtime = 2 }; + int Unrolling; + if (TripCount && Count == TripCount) { + if (Threshold != NoThreshold && UnrolledSize > Threshold) { + DEBUG(dbgs() << " Too large to fully unroll with count: " << Count + << " because size: " << UnrolledSize << ">" << Threshold + << "\n"); + Unrolling = Partial; + } else { + Unrolling = Full; + } + } else if (TripCount && Count < TripCount) { + Unrolling = Partial; + } else { + Unrolling = Runtime; } - // Enforce the threshold. - if (Threshold != NoThreshold) { - unsigned NumInlineCandidates; - bool notDuplicatable; - unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates, - notDuplicatable, TTI); - DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); - if (notDuplicatable) { - DEBUG(dbgs() << " Not unrolling loop which contains non duplicatable" - << " instructions.\n"); + // Reduce count based on the type of unrolling and the threshold values. + unsigned OriginalCount = Count; + bool AllowRuntime = UserRuntime ? CurrentRuntime : UP.Runtime; + if (Unrolling == Partial) { + bool AllowPartial = UserAllowPartial ? CurrentAllowPartial : UP.Partial; + if (!AllowPartial && !CountSetExplicitly) { + DEBUG(dbgs() << " will not try to unroll partially because " + << "-unroll-allow-partial not given\n"); return false; } - if (NumInlineCandidates != 0) { - DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); + if (PartialThreshold != NoThreshold && UnrolledSize > PartialThreshold) { + // Reduce unroll count to be modulo of TripCount for partial unrolling. + Count = PartialThreshold / LoopSize; + while (Count != 0 && TripCount % Count != 0) + Count--; + } + } else if (Unrolling == Runtime) { + if (!AllowRuntime && !CountSetExplicitly) { + DEBUG(dbgs() << " will not try to unroll loop with runtime trip count " + << "-unroll-runtime not given\n"); return false; } - uint64_t Size = (uint64_t)LoopSize*Count; - if (TripCount != 1 && Size > Threshold) { - DEBUG(dbgs() << " Too large to fully unroll with count: " << Count - << " because size: " << Size << ">" << Threshold << "\n"); - bool AllowPartial = UserAllowPartial ?
CurrentAllowPartial : UP.Partial; - if (!AllowPartial && !(Runtime && TripCount == 0)) { - DEBUG(dbgs() << " will not try to unroll partially because " - << "-unroll-allow-partial not given\n"); - return false; - } - if (TripCount) { - // Reduce unroll count to be modulo of TripCount for partial unrolling - Count = Threshold / LoopSize; - while (Count != 0 && TripCount%Count != 0) - Count--; - } - else if (Runtime) { - // Reduce unroll count to be a lower power-of-two value - while (Count != 0 && Size > Threshold) { - Count >>= 1; - Size = LoopSize*Count; - } - } - if (Count < 2) { - DEBUG(dbgs() << " could not unroll partially\n"); - return false + // Halve the unroll count repeatedly until the unrolled size + // satisfies the threshold limit. + while (Count != 0 && UnrolledSize > PartialThreshold) { + Count >>= 1; + UnrolledSize = LoopSize * Count; + } + if (Count > UP.MaxCount) + Count = UP.MaxCount; + DEBUG(dbgs() << " partially unrolling with count: " << Count << "\n"); + } + + if (HasPragma) { + // Mark loop as unrolled to prevent unrolling beyond that + // requested by the pragma. + SetLoopAlreadyUnrolled(L); + + // Emit optimization remarks if we are unable to unroll the loop + // as directed by a pragma. + DebugLoc LoopLoc = L->getStartLoc(); + Function *F = Header->getParent(); + LLVMContext &Ctx = F->getContext(); + if (HasEnablePragma && PragmaCount == 0) { + if (TripCount && Count != TripCount) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to fully unroll loop as directed by unroll(enable) pragma " + "because unrolled size is too large."); + } else if (!TripCount) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to fully unroll loop as directed by unroll(enable) pragma " + "because loop has a runtime trip count."); } - DEBUG(dbgs() << " partially unrolling with count: " << Count << "\n"); + } else if (PragmaCount > 0 && Count != OriginalCount) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to unroll loop the number of times directed by " + "unroll_count pragma because unrolled size is too large."); } } + if (Unrolling != Full && Count < 2) { + // Partial unrolling by 1 is a nop. For full unrolling, a factor + // of 1 makes sense because loop control can be eliminated. + return false; + } + // Unroll the loop.
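To make the two reductions concrete, a worked sketch with invented numbers (none of these values come from the patch):

  // Assumed: LoopSize = 20, PartialThreshold = 150, TripCount = 16.
  unsigned LoopSize = 20, PartialThreshold = 150, TripCount = 16;
  unsigned Count = PartialThreshold / LoopSize; // 7
  while (Count != 0 && TripCount % Count != 0)
    --Count;                                    // 7 -> 6 -> 5 -> 4
  // Partial unrolling proceeds with Count == 4, since 16 % 4 == 0 and
  // 4 * 20 = 80 <= 150. On the runtime path the count is halved
  // instead, e.g. 8 -> 4 once 8 * 20 = 160 exceeds the threshold.

With a count settled, the call below performs the actual transformation.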
- if (!UnrollLoop(L, Count, TripCount, Runtime, TripMultiple, LI, &LPM)) + if (!UnrollLoop(L, Count, TripCount, AllowRuntime, TripMultiple, LI, this, &LPM)) return false; return true; diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp index c4ebfd5..977c53a 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -26,13 +26,11 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-unswitch" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" @@ -40,6 +38,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/Support/CommandLine.h" @@ -53,6 +52,8 @@ #include <set> using namespace llvm; +#define DEBUG_TYPE "loop-unswitch" + STATISTIC(NumBranches, "Number of branches unswitched"); STATISTIC(NumSwitches, "Number of switches unswitched"); STATISTIC(NumSelects , "Number of selects unswitched"); @@ -96,7 +97,7 @@ namespace { public: LUAnalysisCache() : - CurLoopInstructions(0), CurrentLoopProperties(0), + CurLoopInstructions(nullptr), CurrentLoopProperties(nullptr), MaxSize(Threshold) {} @@ -151,44 +152,35 @@ namespace { static char ID; // Pass ID, replacement for typeid explicit LoopUnswitch(bool Os = false) : LoopPass(ID), OptimizeForSize(Os), redoLoop(false), - currentLoop(0), DT(0), loopHeader(0), - loopPreheader(0) { + currentLoop(nullptr), DT(nullptr), loopHeader(nullptr), + loopPreheader(nullptr) { initializeLoopUnswitchPass(*PassRegistry::getPassRegistry()); } - bool runOnLoop(Loop *L, LPPassManager &LPM); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; bool processCurrentLoop(); /// This transformation requires natural loop information & requires that /// loop preheaders be inserted into the CFG. /// - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); AU.addRequired<LoopInfo>(); AU.addPreserved<LoopInfo>(); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); - AU.addPreserved<DominatorTree>(); + AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<ScalarEvolution>(); AU.addRequired<TargetTransformInfo>(); } private: - virtual void releaseMemory() { + void releaseMemory() override { BranchesInfo.forgetLoop(currentLoop); } - /// RemoveLoopFromWorklist - If the specified loop is on the loop worklist, - /// remove it. 
- void RemoveLoopFromWorklist(Loop *L) { - std::vector<Loop*>::iterator I = std::find(LoopProcessWorklist.begin(), - LoopProcessWorklist.end(), L); - if (I != LoopProcessWorklist.end()) - LoopProcessWorklist.erase(I); - } - void initLoopData() { loopHeader = currentLoop->getHeader(); loopPreheader = currentLoop->getLoopPreheader(); @@ -212,9 +204,8 @@ namespace { Instruction *InsertPt); void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L); - void RemoveLoopFromHierarchy(Loop *L); - bool IsTrivialUnswitchCondition(Value *Cond, Constant **Val = 0, - BasicBlock **LoopExit = 0); + bool IsTrivialUnswitchCondition(Value *Cond, Constant **Val = nullptr, + BasicBlock **LoopExit = nullptr); }; } @@ -225,7 +216,7 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) { LoopPropsMapIt PropsIt; bool Inserted; - llvm::tie(PropsIt, Inserted) = + std::tie(PropsIt, Inserted) = LoopsProperties.insert(std::make_pair(L, LoopProperties())); LoopProperties &Props = PropsIt->second; @@ -283,8 +274,8 @@ void LUAnalysisCache::forgetLoop(const Loop *L) { LoopsProperties.erase(LIt); } - CurrentLoopProperties = 0; - CurLoopInstructions = 0; + CurrentLoopProperties = nullptr; + CurLoopInstructions = nullptr; } // Mark case value as unswitched. @@ -355,10 +346,10 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) { // We can never unswitch on vector conditions. if (Cond->getType()->isVectorTy()) - return 0; + return nullptr; // Constants should be folded, not unswitched on! - if (isa<Constant>(Cond)) return 0; + if (isa<Constant>(Cond)) return nullptr; // TODO: Handle: br (VARIANT|INVARIANT). @@ -378,13 +369,18 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) { return RHS; } - return 0; + return nullptr; } bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { + if (skipOptnoneFunction(L)) + return false; + LI = &getAnalysis<LoopInfo>(); LPM = &LPM_Ref; - DT = getAnalysisIfAvailable<DominatorTree>(); + DominatorTreeWrapperPass *DTWP = + getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + DT = DTWP ? &DTWP->getDomTree() : nullptr; currentLoop = L; Function *F = currentLoop->getHeader()->getParent(); bool Changed = false; @@ -397,7 +393,7 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { if (Changed) { // FIXME: Reconstruct dom info, because it is not preserved properly. if (DT) - DT->runOnFunction(*F); + DT->recalculate(*F); } return Changed; } @@ -456,7 +452,7 @@ bool LoopUnswitch::processCurrentLoop() { // Find a value to unswitch on: // FIXME: this should chose the most expensive case! // FIXME: scan for a case with a non-critical edge? - Constant *UnswitchVal = 0; + Constant *UnswitchVal = nullptr; // Do not process same value again and again. // At this point we have some cases already unswitched and @@ -513,7 +509,7 @@ static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB, if (!L->contains(BB)) { // Otherwise, this is a loop exit, this is fine so long as this is the // first exit. - if (ExitBB != 0) return false; + if (ExitBB) return false; ExitBB = BB; return true; } @@ -540,10 +536,10 @@ static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB, static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) { std::set<BasicBlock*> Visited; Visited.insert(L->getHeader()); // Branches to header make infinite loops. 
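As a reader aid for the helpers here and below: the whole pass implements the classic unswitching rewrite, sketched by hand in C++ rather than IR:

  // Before: a loop-invariant test is evaluated on every iteration.
  for (int i = 0; i < n; ++i) {
    if (flag)   // 'flag' never changes inside the loop
      f(i);
    else
      g(i);
  }
  // After unswitching: the test is hoisted and the loop duplicated,
  // each copy specialized for one value of the condition.
  if (flag)
    for (int i = 0; i < n; ++i) f(i);
  else
    for (int i = 0; i < n; ++i) g(i);

A "trivial" condition, handled by the helpers around this point, is one that can be unswitched without duplicating the loop body at all.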
- BasicBlock *ExitBB = 0; + BasicBlock *ExitBB = nullptr; if (isTrivialLoopExitBlockHelper(L, BB, ExitBB, Visited)) return ExitBB; - return 0; + return nullptr; } /// IsTrivialUnswitchCondition - Check to see if this unswitch condition is @@ -564,7 +560,7 @@ bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val, TerminatorInst *HeaderTerm = Header->getTerminator(); LLVMContext &Context = Header->getContext(); - BasicBlock *LoopExitBB = 0; + BasicBlock *LoopExitBB = nullptr; if (BranchInst *BI = dyn_cast<BranchInst>(HeaderTerm)) { // If the header block doesn't end with a conditional branch on Cond, we // can't handle it. @@ -634,8 +630,8 @@ bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val, /// unswitch the loop, reprocess the pieces, then return true. bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) { Function *F = loopHeader->getParent(); - Constant *CondVal = 0; - BasicBlock *ExitBlock = 0; + Constant *CondVal = nullptr; + BasicBlock *ExitBlock = nullptr; if (IsTrivialUnswitchCondition(LoopCond, &CondVal, &ExitBlock)) { // If the condition is trivial, always unswitch. There is no code growth @@ -934,9 +930,8 @@ static void ReplaceUsesOfWith(Instruction *I, Value *V, Worklist.push_back(Use); // Add users to the worklist which may be simplified now. - for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); - UI != E; ++UI) - Worklist.push_back(cast<Instruction>(*UI)); + for (User *U : I->users()) + Worklist.push_back(cast<Instruction>(U)); LPM->deleteSimpleAnalysisValue(I, L); RemoveFromWorklist(I, Worklist); I->replaceAllUsesWith(V); @@ -944,17 +939,6 @@ static void ReplaceUsesOfWith(Instruction *I, Value *V, ++NumSimplify; } -/// RemoveLoopFromHierarchy - We have discovered that the specified loop has -/// become unwrapped, either because the backedge was deleted, or because the -/// edge into the header was removed. If the edge into the header from the -/// latch block was removed, the loop is unwrapped but subloops are still alive, -/// so they just reparent loops. If the loops are actually dead, they will be -/// removed later. -void LoopUnswitch::RemoveLoopFromHierarchy(Loop *L) { - LPM->deleteLoopFromQueue(L); - RemoveLoopFromWorklist(L); -} - // RewriteLoopBodyWithConditionConstant - We know either that the value LIC has // the value specified by Val in the specified loop, or we know it does NOT have // that value. Rewrite any uses of LIC or of properties correlated to it. @@ -986,12 +970,11 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()), !cast<ConstantInt>(Val)->getZExtValue()); - for (Value::use_iterator UI = LIC->use_begin(), E = LIC->use_end(); - UI != E; ++UI) { - Instruction *U = dyn_cast<Instruction>(*UI); - if (!U || !L->contains(U)) + for (User *U : LIC->users()) { + Instruction *UI = dyn_cast<Instruction>(U); + if (!UI || !L->contains(UI)) continue; - Worklist.push_back(U); + Worklist.push_back(UI); } for (std::vector<Instruction*>::iterator UI = Worklist.begin(), @@ -1005,20 +988,19 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, // Otherwise, we don't know the precise value of LIC, but we do know that it // is certainly NOT "Val". As such, simplify any uses in the loop that we // can. This case occurs when we unswitch switch statements. 
- for (Value::use_iterator UI = LIC->use_begin(), E = LIC->use_end(); - UI != E; ++UI) { - Instruction *U = dyn_cast<Instruction>(*UI); - if (!U || !L->contains(U)) + for (User *U : LIC->users()) { + Instruction *UI = dyn_cast<Instruction>(U); + if (!UI || !L->contains(UI)) continue; - Worklist.push_back(U); + Worklist.push_back(UI); // TODO: We could do other simplifications, for example, turning // 'icmp eq LIC, Val' -> false. // If we know that LIC is not Val, use this info to simplify code. - SwitchInst *SI = dyn_cast<SwitchInst>(U); - if (SI == 0 || !isa<ConstantInt>(Val)) continue; + SwitchInst *SI = dyn_cast<SwitchInst>(UI); + if (!SI || !isa<ConstantInt>(Val)) continue; SwitchInst::CaseIt DeadCase = SI->findCaseValue(cast<ConstantInt>(Val)); // Default case is live for multiple values. diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp index 8ced494..3314e1e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loweratomic" #include "llvm/Transforms/Scalar.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -20,6 +19,8 @@ #include "llvm/Pass.h" using namespace llvm; +#define DEBUG_TYPE "loweratomic" + static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { IRBuilder<> Builder(CXI->getParent(), CXI); Value *Ptr = CXI->getPointerOperand(); @@ -31,7 +32,10 @@ static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { Value *Res = Builder.CreateSelect(Equal, Val, Orig); Builder.CreateStore(Res, Ptr); - CXI->replaceAllUsesWith(Orig); + Res = Builder.CreateInsertValue(UndefValue::get(CXI->getType()), Orig, 0); + Res = Builder.CreateInsertValue(Res, Equal, 1); + + CXI->replaceAllUsesWith(Res); CXI->eraseFromParent(); return true; } @@ -42,7 +46,7 @@ static bool LowerAtomicRMWInst(AtomicRMWInst *RMWI) { Value *Val = RMWI->getValOperand(); LoadInst *Orig = Builder.CreateLoad(Ptr); - Value *Res = NULL; + Value *Res = nullptr; switch (RMWI->getOperation()) { default: llvm_unreachable("Unexpected RMW operation"); @@ -111,7 +115,9 @@ namespace { LowerAtomic() : BasicBlockPass(ID) { initializeLowerAtomicPass(*PassRegistry::getPassRegistry()); } - bool runOnBasicBlock(BasicBlock &BB) { + bool runOnBasicBlock(BasicBlock &BB) override { + if (skipOptnoneFunction(BB)) + return false; bool Changed = false; for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) { Instruction *Inst = DI++; diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 9912d3d..7c184a4 100644 --- a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -12,27 +12,28 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "memcpyopt" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" 
#include "llvm/Support/Debug.h" -#include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" #include <list> using namespace llvm; +#define DEBUG_TYPE "memcpyopt" + STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted"); STATISTIC(NumMemSetInfer, "Number of memsets inferred"); STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy"); @@ -49,7 +50,7 @@ static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, int64_t Offset = 0; for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) { ConstantInt *OpC = dyn_cast<ConstantInt>(GEP->getOperand(i)); - if (OpC == 0) + if (!OpC) return VariableIdxFound = true; if (OpC->isZero()) continue; // No offset. @@ -75,6 +76,13 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, const DataLayout &TD) { Ptr1 = Ptr1->stripPointerCasts(); Ptr2 = Ptr2->stripPointerCasts(); + + // Handle the trivial case first. + if (Ptr1 == Ptr2) { + Offset = 0; + return true; + } + GEPOperator *GEP1 = dyn_cast<GEPOperator>(Ptr1); GEPOperator *GEP2 = dyn_cast<GEPOperator>(Ptr2); @@ -82,12 +90,12 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, // If one pointer is a GEP and the other isn't, then see if the GEP is a // constant offset from the base, as in "P" and "gep P, 1". - if (GEP1 && GEP2 == 0 && GEP1->getOperand(0)->stripPointerCasts() == Ptr2) { + if (GEP1 && !GEP2 && GEP1->getOperand(0)->stripPointerCasts() == Ptr2) { Offset = -GetOffsetFromIndex(GEP1, 1, VariableIdxFound, TD); return !VariableIdxFound; } - if (GEP2 && GEP1 == 0 && GEP2->getOperand(0)->stripPointerCasts() == Ptr1) { + if (GEP2 && !GEP1 && GEP2->getOperand(0)->stripPointerCasts() == Ptr1) { Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, TD); return !VariableIdxFound; } @@ -195,9 +203,9 @@ class MemsetRanges { /// because each element is relatively large and expensive to copy. 
std::list<MemsetRange> Ranges; typedef std::list<MemsetRange>::iterator range_iterator; - const DataLayout &TD; + const DataLayout &DL; public: - MemsetRanges(const DataLayout &td) : TD(td) {} + MemsetRanges(const DataLayout &DL) : DL(DL) {} typedef std::list<MemsetRange>::const_iterator const_iterator; const_iterator begin() const { return Ranges.begin(); } @@ -212,7 +220,7 @@ public: } void addStore(int64_t OffsetFromFirst, StoreInst *SI) { - int64_t StoreSize = TD.getTypeStoreSize(SI->getOperand(0)->getType()); + int64_t StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType()); addRange(OffsetFromFirst, StoreSize, SI->getPointerOperand(), SI->getAlignment(), SI); @@ -305,23 +313,23 @@ namespace { class MemCpyOpt : public FunctionPass { MemoryDependenceAnalysis *MD; TargetLibraryInfo *TLI; - const DataLayout *TD; + const DataLayout *DL; public: static char ID; // Pass identification, replacement for typeid MemCpyOpt() : FunctionPass(ID) { initializeMemCpyOptPass(*PassRegistry::getPassRegistry()); - MD = 0; - TLI = 0; - TD = 0; + MD = nullptr; + TLI = nullptr; + DL = nullptr; } - bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; private: // This transformation requires dominator postdominator info - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<DominatorTree>(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<MemoryDependenceAnalysis>(); AU.addRequired<AliasAnalysis>(); AU.addRequired<TargetLibraryInfo>(); @@ -353,7 +361,7 @@ FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); } INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) @@ -366,13 +374,13 @@ INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization", /// attempts to merge them together into a memcpy/memset. Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, Value *StartPtr, Value *ByteVal) { - if (TD == 0) return 0; + if (!DL) return nullptr; // Okay, so we now have a single store that can be splatable. Scan to find // all subsequent stores of the same value to offset from the same pointer. // Join these together into ranges, so we can decide whether contiguous blocks // are stored. - MemsetRanges Ranges(*TD); + MemsetRanges Ranges(*DL); BasicBlock::iterator BI = StartInst; for (++BI; !isa<TerminatorInst>(BI); ++BI) { @@ -396,7 +404,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, // Check to see if this store is to a constant offset from the start ptr. int64_t Offset; if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), - Offset, *TD)) + Offset, *DL)) break; Ranges.addStore(Offset, NextStore); @@ -409,7 +417,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, // Check to see if this store is to a constant offset from the start ptr. int64_t Offset; - if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, *TD)) + if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, *DL)) break; Ranges.addMemSet(Offset, MSI); @@ -419,7 +427,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, // If we have no ranges, then we just had a single store with nothing that // could be merged in. 
This is a very common case of course. if (Ranges.empty()) - return 0; + return nullptr; // If we had at least one store that could be merged in, add the starting // store as well. We try to avoid this unless there is at least something @@ -433,7 +441,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, // Now that we have full information about ranges, loop over the ranges and // emit memset's for anything big enough to be worthwhile. - Instruction *AMemSet = 0; + Instruction *AMemSet = nullptr; for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end(); I != E; ++I) { const MemsetRange &Range = *I; @@ -441,7 +449,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, if (Range.TheStores.size() == 1) continue; // If it is profitable to lower this range to memset, do so now. - if (!Range.isProfitableToUseMemset(*TD)) + if (!Range.isProfitableToUseMemset(*DL)) continue; // Otherwise, we do want to transform this! Create a new memset. @@ -453,7 +461,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, if (Alignment == 0) { Type *EltType = cast<PointerType>(StartPtr->getType())->getElementType(); - Alignment = TD->getABITypeAlignment(EltType); + Alignment = DL->getABITypeAlignment(EltType); } AMemSet = @@ -484,7 +492,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (!SI->isSimple()) return false; - if (TD == 0) return false; + if (!DL) return false; // Detect cases where we're performing call slot forwarding, but // happen to be using a load-store pair to implement it, rather than @@ -493,7 +501,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (LI->isSimple() && LI->hasOneUse() && LI->getParent() == SI->getParent()) { MemDepResult ldep = MD->getDependency(LI); - CallInst *C = 0; + CallInst *C = nullptr; if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst())) C = dyn_cast<CallInst>(ldep.getInst()); @@ -505,7 +513,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { for (BasicBlock::iterator I = --BasicBlock::iterator(SI), E = C; I != E; --I) { if (AA.getModRefInfo(&*I, StoreLoc) != AliasAnalysis::NoModRef) { - C = 0; + C = nullptr; break; } } @@ -514,15 +522,15 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (C) { unsigned storeAlign = SI->getAlignment(); if (!storeAlign) - storeAlign = TD->getABITypeAlignment(SI->getOperand(0)->getType()); + storeAlign = DL->getABITypeAlignment(SI->getOperand(0)->getType()); unsigned loadAlign = LI->getAlignment(); if (!loadAlign) - loadAlign = TD->getABITypeAlignment(LI->getType()); + loadAlign = DL->getABITypeAlignment(LI->getType()); bool changed = performCallSlotOptzn(LI, SI->getPointerOperand()->stripPointerCasts(), LI->getPointerOperand()->stripPointerCasts(), - TD->getTypeStoreSize(SI->getOperand(0)->getType()), + DL->getTypeStoreSize(SI->getOperand(0)->getType()), std::min(storeAlign, loadAlign), C); if (changed) { MD->removeInstruction(SI); @@ -596,13 +604,13 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, return false; // Check that all of src is copied to dest. 
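For readers new to it, the call slot optimization implemented below performs, in sketch form (an assumed C-level example; the pass works on IR):

  char tmp[32], dest[32];
  f(tmp);                  // the call fills tmp
  memcpy(dest, tmp, 32);   // tmp exists only to be copied into dest
  // becomes, once every safety check in this function passes:
  f(dest);                 // the call writes its result in place

The size checks that follow enforce the "all of src is copied to dest" requirement just stated.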
- if (TD == 0) return false; + if (!DL) return false; ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize()); if (!srcArraySize) return false; - uint64_t srcSize = TD->getTypeAllocSize(srcAlloca->getAllocatedType()) * + uint64_t srcSize = DL->getTypeAllocSize(srcAlloca->getAllocatedType()) * srcArraySize->getZExtValue(); if (cpyLen < srcSize) @@ -617,7 +625,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, if (!destArraySize) return false; - uint64_t destSize = TD->getTypeAllocSize(A->getAllocatedType()) * + uint64_t destSize = DL->getTypeAllocSize(A->getAllocatedType()) * destArraySize->getZExtValue(); if (destSize < srcSize) @@ -636,7 +644,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, return false; } - uint64_t destSize = TD->getTypeAllocSize(StructTy); + uint64_t destSize = DL->getTypeAllocSize(StructTy); if (destSize < srcSize) return false; } else { @@ -646,7 +654,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, // Check that dest points to memory that is at least as aligned as src. unsigned srcAlign = srcAlloca->getAlignment(); if (!srcAlign) - srcAlign = TD->getABITypeAlignment(srcAlloca->getAllocatedType()); + srcAlign = DL->getABITypeAlignment(srcAlloca->getAllocatedType()); bool isDestSufficientlyAligned = srcAlign <= cpyAlign; // If dest is not aligned enough and we can't increase its alignment then // bail out. @@ -657,30 +665,34 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, // guarantees that it holds only undefined values when passed in (so the final // memcpy can be dropped), that it is not read or written between the call and // the memcpy, and that writing beyond the end of it is undefined. - SmallVector<User*, 8> srcUseList(srcAlloca->use_begin(), - srcAlloca->use_end()); + SmallVector<User*, 8> srcUseList(srcAlloca->user_begin(), + srcAlloca->user_end()); while (!srcUseList.empty()) { - User *UI = srcUseList.pop_back_val(); + User *U = srcUseList.pop_back_val(); - if (isa<BitCastInst>(UI)) { - for (User::use_iterator I = UI->use_begin(), E = UI->use_end(); - I != E; ++I) - srcUseList.push_back(*I); - } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(UI)) { + if (isa<BitCastInst>(U) || isa<AddrSpaceCastInst>(U)) { + for (User *UU : U->users()) + srcUseList.push_back(UU); + } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) { if (G->hasAllZeroIndices()) - for (User::use_iterator I = UI->use_begin(), E = UI->use_end(); - I != E; ++I) - srcUseList.push_back(*I); + for (User *UU : U->users()) + srcUseList.push_back(UU); else return false; - } else if (UI != C && UI != cpy) { + } else if (U != C && U != cpy) { return false; } } + // Check that src isn't captured by the called function since the + // transformation can cause aliasing issues in that case. + for (unsigned i = 0, e = CS.arg_size(); i != e; ++i) + if (CS.getArgument(i) == cpySrc && !CS.doesNotCapture(i)) + return false; + // Since we're changing the parameter to the callsite, we need to make sure // that what would be the new parameter dominates the callsite. - DominatorTree &DT = getAnalysis<DominatorTree>(); + DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); if (Instruction *cpyDestInst = dyn_cast<Instruction>(cpyDest)) if (!DT.dominates(cpyDestInst, C)) return false; @@ -816,9 +828,8 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, /// circumstances). This allows later passes to remove the first memcpy /// altogether. 
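Concretely, the dependence rewrite described above looks like this (hypothetical buffers and size):

  memcpy(b, a, n);   // MDep
  memcpy(c, b, n);   // M: reads b, which was just filled from a
  // becomes:
  memcpy(b, a, n);   // now possibly dead...
  memcpy(c, a, n);   // ...and DSE can delete it if b is otherwise unused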
bool MemCpyOpt::processMemCpy(MemCpyInst *M) { - // We can only optimize statically-sized memcpy's that are non-volatile. - ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength()); - if (CopySize == 0 || M->isVolatile()) return false; + // We can only optimize non-volatile memcpy's. + if (M->isVolatile()) return false; // If the source and destination of the memcpy are the same, then zap it. if (M->getSource() == M->getDest()) { @@ -832,7 +843,7 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { if (GV->isConstant() && GV->hasDefinitiveInitializer()) if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) { IRBuilder<> Builder(M); - Builder.CreateMemSet(M->getRawDest(), ByteVal, CopySize, + Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(), M->getAlignment(), false); MD->removeInstruction(M); M->eraseFromParent(); @@ -840,9 +851,16 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { return true; } - // The are two possible optimizations we can do for memcpy: + // The optimizations after this point require the memcpy size. + ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength()); + if (!CopySize) return false; + + // There are three possible optimizations we can do for memcpy: // a) memcpy-memcpy xform which exposes redundance for DSE. // b) call-memcpy xform for return slot optimization. + // c) memcpy from freshly alloca'd space or space that has just started its + // lifetime copies undefined data, and we can therefore eliminate the + // memcpy in favor of the data that was already at the destination. MemDepResult DepInfo = MD->getDependency(M); if (DepInfo.isClobber()) { if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) { @@ -862,6 +880,25 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { if (SrcDepInfo.isClobber()) { if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst())) return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue()); + } else if (SrcDepInfo.isDef()) { + Instruction *I = SrcDepInfo.getInst(); + bool hasUndefContents = false; + + if (isa<AllocaInst>(I)) { + hasUndefContents = true; + } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start) + if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0))) + if (LTSize->getZExtValue() >= CopySize->getZExtValue()) + hasUndefContents = true; + } + + if (hasUndefContents) { + MD->removeInstruction(M); + M->eraseFromParent(); + ++NumMemCpyInstr; + return true; + } } return false; @@ -899,12 +936,12 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { /// processByValArgument - This is called on every byval argument in call sites. bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { - if (TD == 0) return false; + if (!DL) return false; // Find out what feeds this byval argument. Value *ByValArg = CS.getArgument(ArgNo); Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType(); - uint64_t ByValSize = TD->getTypeAllocSize(ByValTy); + uint64_t ByValSize = DL->getTypeAllocSize(ByValTy); MemDepResult DepInfo = MD->getPointerDependencyFrom(AliasAnalysis::Location(ByValArg, ByValSize), true, CS.getInstruction(), @@ -916,13 +953,13 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { // a memcpy, see if we can byval from the source of the memcpy instead of the // result.
MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst()); - if (MDep == 0 || MDep->isVolatile() || + if (!MDep || MDep->isVolatile() || ByValArg->stripPointerCasts() != MDep->getDest()) return false; // The length of the memcpy must be larger or equal to the size of the byval. ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength()); - if (C1 == 0 || C1->getValue().getZExtValue() < ByValSize) + if (!C1 || C1->getValue().getZExtValue() < ByValSize) return false; // Get the alignment of the byval. If the call doesn't specify the alignment, @@ -933,7 +970,7 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { // If it is greater than the memcpy, then we check to see if we can force the // source of the memcpy to the alignment we need. If we fail, we bail out. if (MDep->getAlignment() < ByValAlign && - getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign, TD) < ByValAlign) + getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign, DL) < ByValAlign) return false; // Verify that the copied-from memory doesn't change in between the memcpy and @@ -1007,9 +1044,13 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { // function. // bool MemCpyOpt::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; + bool MadeChange = false; MD = &getAnalysis<MemoryDependenceAnalysis>(); - TD = getAnalysisIfAvailable<DataLayout>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; TLI = &getAnalysis<TargetLibraryInfo>(); // If we don't have at least memset and memcpy, there is little point of doing @@ -1024,6 +1065,6 @@ bool MemCpyOpt::runOnFunction(Function &F) { MadeChange = true; } - MD = 0; + MD = nullptr; return MadeChange; } diff --git a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp new file mode 100644 index 0000000..a7e8024 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -0,0 +1,632 @@ +//===- MergedLoadStoreMotion.cpp - merge and hoist/sink load/stores -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//! \file +//! \brief This pass performs merges of loads and stores on both sides of a +// diamond (hammock). It hoists the loads and sinks the stores. +// +// The algorithm iteratively hoists two loads to the same address out of a +// diamond (hammock) and merges them into a single load in the header. Similarly, +// it sinks and merges two stores to the tail block (footer). The algorithm +// iterates over the instructions of one side of the diamond and attempts to +// find a matching load/store on the other side. It hoists / sinks when it +// thinks it is safe to do so. This optimization helps with, e.g., hiding load +// latencies, triggering if-conversion, and reducing static code size.
+// +//===----------------------------------------------------------------------===// +// +// +// Example: +// Diamond shaped code before merge: +// +// header: +// br %cond, label %if.then, label %if.else +// / \ +// / \ +// / \ +// if.then: if.else: +// %lt = load %addr_l %le = load %addr_l +// <use %lt> <use %le> +// <...> <...> +// store %st, %addr_s store %se, %addr_s +// br label %if.end br label %if.end +// \ / +// \ / +// \ / +// if.end ("footer"): +// <...> +// +// Diamond shaped code after merge: +// +// header: +// %l = load %addr_l +// br %cond, label %if.then, label %if.else +// / \ +// / \ +// / \ +// if.then: if.else: +// <use %l> <use %l> +// <...> <...> +// br label %if.end br label %if.end +// \ / +// \ / +// \ / +// if.end ("footer"): +// %s.sink = phi [%st, if.then], [%se, if.else] +// <...> +// store %s.sink, %addr_s +// <...> +// +// +//===----------------------- TODO -----------------------------------------===// +// +// 1) Generalize to regions other than diamonds +// 2) Be more aggressive merging memory operations +// Note that both changes require register pressure control +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +#include <vector> +using namespace llvm; + +#define DEBUG_TYPE "mldst-motion" + +//===----------------------------------------------------------------------===// +// MergedLoadStoreMotion Pass +//===----------------------------------------------------------------------===// +static cl::opt<bool> +EnableMLSM("mlsm", cl::desc("Enable motion of merged load and store"), + cl::init(true)); + +namespace { +class MergedLoadStoreMotion : public FunctionPass { + AliasAnalysis *AA; + MemoryDependenceAnalysis *MD; + +public: + static char ID; // Pass identification, replacement for typeid + explicit MergedLoadStoreMotion(void) + : FunctionPass(ID), MD(nullptr), MagicCompileTimeControl(250) { + initializeMergedLoadStoreMotionPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; + +private: + // This transformation requires dominator postdominator info + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<MemoryDependenceAnalysis>(); + AU.addRequired<AliasAnalysis>(); + AU.addPreserved<AliasAnalysis>(); + } + + // Helper routines + + /// + /// \brief Remove instruction from parent and update memory dependence + /// analysis. 
+ /// + void removeInstruction(Instruction *Inst); + BasicBlock *getDiamondTail(BasicBlock *BB); + bool isDiamondHead(BasicBlock *BB); + // Routines for hoisting loads + bool isLoadHoistBarrier(Instruction *Inst); + LoadInst *canHoistFromBlock(BasicBlock *BB, LoadInst *LI); + void hoistInstruction(BasicBlock *BB, Instruction *HoistCand, + Instruction *ElseInst); + bool isSafeToHoist(Instruction *I) const; + bool hoistLoad(BasicBlock *BB, LoadInst *HoistCand, LoadInst *ElseInst); + bool mergeLoads(BasicBlock *BB); + // Routines for sinking stores + StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI); + PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1); + bool isStoreSinkBarrier(Instruction *Inst); + bool sinkStore(BasicBlock *BB, StoreInst *SinkCand, StoreInst *ElseInst); + bool mergeStores(BasicBlock *BB); + // The mergeLoad/Store algorithms could have Size0 * Size1 complexity, + // where Size0 and Size1 are the #instructions on the two sides of + // the diamond. The constant chosen here is arbitrary. Compiler Time + // Control is enforced by the check Size0 * Size1 < MagicCompileTimeControl. + const int MagicCompileTimeControl; +}; + +char MergedLoadStoreMotion::ID = 0; +} + +/// +/// \brief createMergedLoadStoreMotionPass - The public interface to this file. +/// +FunctionPass *llvm::createMergedLoadStoreMotionPass() { + return new MergedLoadStoreMotion(); +} + +INITIALIZE_PASS_BEGIN(MergedLoadStoreMotion, "mldst-motion", + "MergedLoadStoreMotion", false, false) +INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_END(MergedLoadStoreMotion, "mldst-motion", + "MergedLoadStoreMotion", false, false) + +/// +/// \brief Remove instruction from parent and update memory dependence analysis. +/// +void MergedLoadStoreMotion::removeInstruction(Instruction *Inst) { + // Notify the memory dependence analysis. + if (MD) { + MD->removeInstruction(Inst); + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) + MD->invalidateCachedPointerInfo(LI->getPointerOperand()); + if (Inst->getType()->getScalarType()->isPointerTy()) { + MD->invalidateCachedPointerInfo(Inst); + } + } + Inst->eraseFromParent(); +} + +/// +/// \brief Return tail block of a diamond. +/// +BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) { + assert(isDiamondHead(BB) && "Basic block is not head of a diamond"); + BranchInst *BI = (BranchInst *)(BB->getTerminator()); + BasicBlock *Succ0 = BI->getSuccessor(0); + BasicBlock *Tail = Succ0->getTerminator()->getSuccessor(0); + return Tail; +} + +/// +/// \brief True when BB is the head of a diamond (hammock) +/// +bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) { + if (!BB) + return false; + if (!isa<BranchInst>(BB->getTerminator())) + return false; + if (BB->getTerminator()->getNumSuccessors() != 2) + return false; + + BranchInst *BI = (BranchInst *)(BB->getTerminator()); + BasicBlock *Succ0 = BI->getSuccessor(0); + BasicBlock *Succ1 = BI->getSuccessor(1); + + if (!Succ0->getSinglePredecessor() || + Succ0->getTerminator()->getNumSuccessors() != 1) + return false; + if (!Succ1->getSinglePredecessor() || + Succ1->getTerminator()->getNumSuccessors() != 1) + return false; + + BasicBlock *Tail = Succ0->getTerminator()->getSuccessor(0); + // Ignore triangles. 
+ if (Succ1->getTerminator()->getSuccessor(0) != Tail) + return false; + return true; +} + +/// +/// \brief True when instruction is a hoist barrier for a load +/// +/// Whenever an instruction could possibly modify the value +/// being loaded or prevent the load from happening, +/// it is considered a hoist barrier. +/// +bool MergedLoadStoreMotion::isLoadHoistBarrier(Instruction *Inst) { + // FIXME: A call with no side effects should not be a barrier. + // Aren't all such calls covered by mayHaveSideEffects() below? + // Then this check can be removed. + if (isa<CallInst>(Inst)) + return true; + if (isa<TerminatorInst>(Inst)) + return true; + // FIXME: Conservatively let a store instruction block the load. + // Use alias analysis instead. + if (isa<StoreInst>(Inst)) + return true; + // Note: mayHaveSideEffects covers all instructions that could + // trigger a change to state. Eg. in-flight stores have to be executed + // before ordered loads or fences, calls could invoke functions that store + // data to memory etc. + if (Inst->mayHaveSideEffects()) { + return true; + } + DEBUG(dbgs() << "No Hoist Barrier\n"); + return false; +} + +/// +/// \brief Decide if a load can be hoisted +/// +/// When there is a load in \p BB to the same address as \p LI +/// and it can be hoisted from \p BB, return that load. +/// Otherwise return Null. +/// +LoadInst *MergedLoadStoreMotion::canHoistFromBlock(BasicBlock *BB, + LoadInst *LI) { + LoadInst *I = nullptr; + assert(isa<LoadInst>(LI)); + if (LI->isUsedOutsideOfBlock(LI->getParent())) + return nullptr; + + for (BasicBlock::iterator BBI = BB->begin(), BBE = BB->end(); BBI != BBE; + ++BBI) { + Instruction *Inst = BBI; + + // Only merge and hoist loads when their result is used only in BB + if (isLoadHoistBarrier(Inst)) + break; + if (!isa<LoadInst>(Inst)) + continue; + if (Inst->isUsedOutsideOfBlock(Inst->getParent())) + continue; + + AliasAnalysis::Location LocLI = AA->getLocation(LI); + AliasAnalysis::Location LocInst = AA->getLocation((LoadInst *)Inst); + if (AA->isMustAlias(LocLI, LocInst) && LI->getType() == Inst->getType()) { + I = (LoadInst *)Inst; + break; + } + } + return I; +} + +/// +/// \brief Merge two equivalent instructions \p HoistCand and \p ElseInst into +/// \p BB +/// +/// BB is the head of a diamond +/// +void MergedLoadStoreMotion::hoistInstruction(BasicBlock *BB, + Instruction *HoistCand, + Instruction *ElseInst) { + DEBUG(dbgs() << " Hoist Instruction into BB \n"; BB->dump(); + dbgs() << "Instruction Left\n"; HoistCand->dump(); dbgs() << "\n"; + dbgs() << "Instruction Right\n"; ElseInst->dump(); dbgs() << "\n"); + // Hoist the instruction. + assert(HoistCand->getParent() != BB); + + // Intersect optional metadata. + HoistCand->intersectOptionalDataWith(ElseInst); + HoistCand->dropUnknownMetadata(); + + // Prepend point for instruction insert + Instruction *HoistPt = BB->getTerminator(); + + // Merged instruction + Instruction *HoistedInst = HoistCand->clone(); + + // Notify AA of the new value. + if (isa<LoadInst>(HoistCand)) + AA->copyValue(HoistCand, HoistedInst); + + // Hoist instruction. + HoistedInst->insertBefore(HoistPt); + + HoistCand->replaceAllUsesWith(HoistedInst); + removeInstruction(HoistCand); + // Replace the else block instruction.
+ ElseInst->replaceAllUsesWith(HoistedInst); + removeInstruction(ElseInst); +} + +/// +/// \brief Return true if no operand of \p I is defined in I's parent block +/// +bool MergedLoadStoreMotion::isSafeToHoist(Instruction *I) const { + BasicBlock *Parent = I->getParent(); + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { + Instruction *Instr = dyn_cast<Instruction>(I->getOperand(i)); + if (Instr && Instr->getParent() == Parent) + return false; + } + return true; +} + +/// +/// \brief Merge two equivalent loads and GEPs and hoist into diamond head +/// +bool MergedLoadStoreMotion::hoistLoad(BasicBlock *BB, LoadInst *L0, + LoadInst *L1) { + // Only one definition? + Instruction *A0 = dyn_cast<Instruction>(L0->getPointerOperand()); + Instruction *A1 = dyn_cast<Instruction>(L1->getPointerOperand()); + if (A0 && A1 && A0->isIdenticalTo(A1) && isSafeToHoist(A0) && + A0->hasOneUse() && (A0->getParent() == L0->getParent()) && + A1->hasOneUse() && (A1->getParent() == L1->getParent()) && + isa<GetElementPtrInst>(A0)) { + DEBUG(dbgs() << "Hoist Instruction into BB \n"; BB->dump(); + dbgs() << "Instruction Left\n"; L0->dump(); dbgs() << "\n"; + dbgs() << "Instruction Right\n"; L1->dump(); dbgs() << "\n"); + hoistInstruction(BB, A0, A1); + hoistInstruction(BB, L0, L1); + return true; + } else + return false; +} + +/// +/// \brief Try to hoist two loads to same address into diamond header +/// +/// Starting from a diamond head block, iterate over the instructions in one +/// successor block and try to match a load in the second successor. +/// +bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) { + bool MergedLoads = false; + assert(isDiamondHead(BB)); + BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()); + BasicBlock *Succ0 = BI->getSuccessor(0); + BasicBlock *Succ1 = BI->getSuccessor(1); + // #Instructions in Succ1 for Compile Time Control + int Size1 = Succ1->size(); + int NLoads = 0; + for (BasicBlock::iterator BBI = Succ0->begin(), BBE = Succ0->end(); + BBI != BBE;) { + + Instruction *I = BBI; + ++BBI; + if (isLoadHoistBarrier(I)) + break; + + // Only try to hoist simple (non-atomic, non-volatile) loads. + if (!isa<LoadInst>(I)) + continue; + + LoadInst *L0 = (LoadInst *)I; + if (!L0->isSimple()) + continue; + + ++NLoads; + if (NLoads * Size1 >= MagicCompileTimeControl) + break; + if (LoadInst *L1 = canHoistFromBlock(Succ1, L0)) { + bool Res = hoistLoad(BB, L0, L1); + MergedLoads |= Res; + // Don't attempt to hoist above loads that had not been hoisted. + if (!Res) + break; + } + } + return MergedLoads; +} + +/// +/// \brief True when instruction is sink barrier for a store +/// +bool MergedLoadStoreMotion::isStoreSinkBarrier(Instruction *Inst) { + // FIXME: Conservatively let a load instruction block the store. + // Use alias analysis instead. + if (isa<LoadInst>(Inst)) + return true; + if (isa<CallInst>(Inst)) + return true; + if (isa<TerminatorInst>(Inst) && !isa<BranchInst>(Inst)) + return true; + // Note: mayHaveSideEffects covers all instructions that could + // trigger a change to state. Eg. in-flight stores have to be executed + // before ordered loads or fences, calls could invoke functions that store + // data to memory etc. + if (!isa<StoreInst>(Inst) && Inst->mayHaveSideEffects()) { + return true; + } + DEBUG(dbgs() << "No Sink Barrier\n"); + return false; +} + +/// +/// \brief Check if \p BB contains a store to the same address as \p SI +/// +/// \return The store in \p BB when it is safe to sink. Otherwise return Null.
+/// +StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB, + StoreInst *SI) { + StoreInst *I = 0; + DEBUG(dbgs() << "can Sink? : "; SI->dump(); dbgs() << "\n"); + for (BasicBlock::reverse_iterator RBI = BB->rbegin(), RBE = BB->rend(); + RBI != RBE; ++RBI) { + Instruction *Inst = &*RBI; + + // Stop at a sink barrier; otherwise look for a store to the same address. + if (isStoreSinkBarrier(Inst)) + break; + if (isa<StoreInst>(Inst)) { + AliasAnalysis::Location LocSI = AA->getLocation(SI); + AliasAnalysis::Location LocInst = AA->getLocation((StoreInst *)Inst); + if (AA->isMustAlias(LocSI, LocInst)) { + I = (StoreInst *)Inst; + break; + } + } + } + return I; +} + +/// +/// \brief Create a PHI node in BB for the operands of S0 and S1 +/// +PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0, + StoreInst *S1) { + // Create a phi if the values mismatch. + PHINode *NewPN = 0; + Value *Opd1 = S0->getValueOperand(); + Value *Opd2 = S1->getValueOperand(); + if (Opd1 != Opd2) { + NewPN = PHINode::Create(Opd1->getType(), 2, Opd2->getName() + ".sink", + BB->begin()); + NewPN->addIncoming(Opd1, S0->getParent()); + NewPN->addIncoming(Opd2, S1->getParent()); + if (NewPN->getType()->getScalarType()->isPointerTy()) { + // Notify AA of the new value. + AA->copyValue(Opd1, NewPN); + AA->copyValue(Opd2, NewPN); + // AA needs to be informed when a PHI-use of the pointer value is added + for (unsigned I = 0, E = NewPN->getNumIncomingValues(); I != E; ++I) { + unsigned J = PHINode::getOperandNumForIncomingValue(I); + AA->addEscapingUse(NewPN->getOperandUse(J)); + } + if (MD) + MD->invalidateCachedPointerInfo(NewPN); + } + } + return NewPN; +} + +/// +/// \brief Merge two stores to same address and sink into \p BB +/// +/// Also sinks GEP instruction computing the store address +/// +bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0, + StoreInst *S1) { + // Only one definition? + Instruction *A0 = dyn_cast<Instruction>(S0->getPointerOperand()); + Instruction *A1 = dyn_cast<Instruction>(S1->getPointerOperand()); + if (A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() && + (A0->getParent() == S0->getParent()) && A1->hasOneUse() && + (A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0)) { + DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump(); + dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n"; + dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n"); + // Sink the instruction. + BasicBlock::iterator InsertPt = BB->getFirstInsertionPt(); + // Intersect optional metadata. + S0->intersectOptionalDataWith(S1); + S0->dropUnknownMetadata(); + + // Create the new store to be inserted at the join point. + StoreInst *SNew = (StoreInst *)(S0->clone()); + Instruction *ANew = A0->clone(); + AA->copyValue(S0, SNew); + SNew->insertBefore(InsertPt); + ANew->insertBefore(SNew); + + assert(S0->getParent() == A0->getParent()); + assert(S1->getParent() == A1->getParent()); + + PHINode *NewPN = getPHIOperand(BB, S0, S1); + // New PHI operand? Use it. + if (NewPN) + SNew->setOperand(0, NewPN); + removeInstruction(S0); + removeInstruction(S1); + A0->replaceAllUsesWith(ANew); + removeInstruction(A0); + A1->replaceAllUsesWith(ANew); + removeInstruction(A1); + return true; + } + return false; +} + +/// +/// \brief True when two equivalent stores were merged and sunk into the footer +/// +/// Starting from a diamond tail block, iterate over the instructions in one +/// predecessor block and try to match a store in the second predecessor.
+
+///
+/// \brief Try to merge and sink two equivalent stores into the diamond footer
+///
+/// Starting from a diamond tail block, iterate over the instructions in one
+/// predecessor block and try to match a store in the second predecessor.
+///
+bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) {
+
+ bool MergedStores = false;
+ assert(T && "Footer of a diamond cannot be empty");
+
+ pred_iterator PI = pred_begin(T), E = pred_end(T);
+ assert(PI != E);
+ BasicBlock *Pred0 = *PI;
+ ++PI;
+ BasicBlock *Pred1 = *PI;
+ ++PI;
+ // Is T the tail block of a diamond/hammock?
+ if (Pred0 == Pred1)
+ return false; // No.
+ if (PI != E)
+ return false; // No. More than 2 predecessors.
+
+ // #Instructions in Pred1 for compile-time control
+ int Size1 = Pred1->size();
+ int NStores = 0;
+
+ for (BasicBlock::reverse_iterator RBI = Pred0->rbegin(), RBE = Pred0->rend();
+ RBI != RBE;) {
+
+ Instruction *I = &*RBI;
+ ++RBI;
+ if (isStoreSinkBarrier(I))
+ break;
+ // Only sink simple (non-atomic, non-volatile) stores.
+ if (!isa<StoreInst>(I))
+ continue;
+ StoreInst *S0 = (StoreInst *)I;
+ if (!S0->isSimple())
+ continue;
+
+ ++NStores;
+ if (NStores * Size1 >= MagicCompileTimeControl)
+ break;
+ if (StoreInst *S1 = canSinkFromBlock(Pred1, S0)) {
+ bool Res = sinkStore(T, S0, S1);
+ MergedStores |= Res;
+ // Don't attempt to sink below stores that had to stick around.
+ // After removal of a store and some of its feeding
+ // instructions, search again from the beginning since the iterator
+ // is likely stale at this point.
+ if (!Res)
+ break;
+ else {
+ RBI = Pred0->rbegin();
+ RBE = Pred0->rend();
+ DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump());
+ }
+ }
+ }
+ return MergedStores;
+}
+///
+/// \brief Run the transformation for each function
+///
+bool MergedLoadStoreMotion::runOnFunction(Function &F) {
+ MD = &getAnalysis<MemoryDependenceAnalysis>();
+ AA = &getAnalysis<AliasAnalysis>();
+
+ bool Changed = false;
+ if (!EnableMLSM)
+ return false;
+ DEBUG(dbgs() << "Instruction Merger\n");
+
+ // Walk all blocks, looking for diamond heads whose loads can be
+ // hoisted and whose stores can be sunk into the matching tail.
+ for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) {
+ BasicBlock *BB = FI++;
+
+ // Hoist equivalent loads and sink stores
+ // outside diamonds when possible
+ // Run outside core GVN
+ if (isDiamondHead(BB)) {
+ Changed |= mergeLoads(BB);
+ Changed |= mergeStores(getDiamondTail(BB));
+ }
+ }
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
index 15cee44..7cce89e 100644
--- a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -13,7 +13,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "partially-inline-libcalls"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Intrinsics.h"
@@ -25,6 +24,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "partially-inline-libcalls"
+
 namespace {
 class PartiallyInlineLibCalls : public FunctionPass {
 public:
@@ -35,8 +36,8 @@ namespace {
 initializePartiallyInlineLibCallsPass(*PassRegistry::getPassRegistry());
 }
 
- virtual void getAnalysisUsage(AnalysisUsage &AU) const;
- virtual bool runOnFunction(Function &F);
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
 
 private:
 /// Optimize calls to sqrt.
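A sketch of what "partially inlining" sqrt means in practice, expressed in C++ rather than in the IR the pass emits (which this hunk does not show, so treat the exact shape as an assumption): the call is split into a fast inline square root plus a slow path that re-invokes the library routine only when the result is NaN, the one case where errno may need to be set.

#include <cmath>

// Fast path uses a square root the compiler can emit inline; the libm
// call survives only on the slow path, preserving errno semantics.
double sqrt_partially_inlined(double x) {
  double r = __builtin_sqrt(x); // errno-free square root
  if (r != r)                   // NaN result: invalid (negative) input
    return std::sqrt(x);        // slow path: libm call may set errno
  return r;
}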
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp index 328a9c5..ea2cf7c 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -20,29 +20,29 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "reassociate" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Assembly/Writer.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" -#include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" #include <algorithm> using namespace llvm; +#define DEBUG_TYPE "reassociate" + STATISTIC(NumChanged, "Number of insts reassociated"); STATISTIC(NumAnnihil, "Number of expr tree annihilated"); STATISTIC(NumFactor , "Number of multiplies factored"); @@ -67,7 +67,7 @@ static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) { << *Ops[0].Op->getType() << '\t'; for (unsigned i = 0, e = Ops.size(); i != e; ++i) { dbgs() << "[ "; - WriteAsOperand(dbgs(), Ops[i].Op, false, M); + Ops[i].Op->printAsOperand(dbgs(), false, M); dbgs() << ", #" << Ops[i].Rank << "] "; } } @@ -123,14 +123,14 @@ namespace { public: XorOpnd(Value *V); - bool isInvalid() const { return SymbolicPart == 0; } + bool isInvalid() const { return SymbolicPart == nullptr; } bool isOrExpr() const { return isOr; } Value *getValue() const { return OrigVal; } Value *getSymbolicPart() const { return SymbolicPart; } unsigned getSymbolicRank() const { return SymbolicRank; } const APInt &getConstPart() const { return ConstPart; } - void Invalidate() { SymbolicPart = OrigVal = 0; } + void Invalidate() { SymbolicPart = OrigVal = nullptr; } void setSymbolicRank(unsigned R) { SymbolicRank = R; } // Sort the XorOpnd-Pointer in ascending order of symbolic-value-rank. @@ -168,9 +168,9 @@ namespace { initializeReassociatePass(*PassRegistry::getPassRegistry()); } - bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); } private: @@ -237,7 +237,7 @@ static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) { if (V->hasOneUse() && isa<Instruction>(V) && cast<Instruction>(V)->getOpcode() == Opcode) return cast<BinaryOperator>(V); - return 0; + return nullptr; } static bool isUnmovableInstruction(Instruction *I) { @@ -285,7 +285,7 @@ void Reassociate::BuildRankMap(Function &F) { unsigned Reassociate::getRank(Value *V) { Instruction *I = dyn_cast<Instruction>(V); - if (I == 0) { + if (!I) { if (isa<Argument>(V)) return ValueRankMap[V]; // Function argument. return 0; // Otherwise it's a global or constant, rank 0. } @@ -706,7 +706,7 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, // ExpressionChanged - Non-null if the rewritten expression differs from the // original in some non-trivial way, requiring the clearing of optional flags. 
 // Flags are cleared from the operator in ExpressionChanged up to I inclusive.
- BinaryOperator *ExpressionChanged = 0;
+ BinaryOperator *ExpressionChanged = nullptr;
 for (unsigned i = 0; ; ++i) {
 // The last operation (which comes earliest in the IR) is special as both
 // operands will come from Ops, rather than just one with the other being
@@ -821,7 +821,7 @@ void Reassociate::RewriteExprTree(BinaryOperator *I,
 if (ExpressionChanged == I)
 break;
 ExpressionChanged->moveBefore(I);
- ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->use_begin());
+ ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->user_begin());
 } while (1);
 
 // Throw away any left over nodes from the original expression.
@@ -863,8 +863,7 @@ static Value *NegateValue(Value *V, Instruction *BI) {
 
 // Okay, we need to materialize a negated version of V with an instruction.
 // Scan the use lists of V to see if we have one already.
- for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){
- User *U = *UI;
+ for (User *U : V->users()) {
 if (!BinaryOperator::isNeg(U))
 continue;
 
 // We found one! Now we have to make sure that the definition dominates
@@ -914,8 +913,8 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) {
 isReassociableOp(Sub->getOperand(1), Instruction::Sub))
 return true;
 if (Sub->hasOneUse() &&
- (isReassociableOp(Sub->use_back(), Instruction::Add) ||
- isReassociableOp(Sub->use_back(), Instruction::Sub)))
+ (isReassociableOp(Sub->user_back(), Instruction::Add) ||
+ isReassociableOp(Sub->user_back(), Instruction::Sub)))
 return true;
 
 return false;
@@ -997,7 +996,7 @@ static Value *EmitAddTreeOfValues(Instruction *I,
 /// remove Factor from the tree and return the new tree.
 Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) {
 BinaryOperator *BO = isReassociableOp(V, Instruction::Mul);
- if (!BO) return 0;
+ if (!BO) return nullptr;
 
 SmallVector<RepeatedValue, 8> Tree;
 MadeChange |= LinearizeExprTree(BO, Tree);
@@ -1031,7 +1030,7 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) {
 if (!FoundFactor) {
 // Make sure to restore the operands to the expression tree.
 RewriteExprTree(BO, Factors);
- return 0;
+ return nullptr;
 }
 
 BasicBlock::iterator InsertPt = BO; ++InsertPt;
@@ -1116,7 +1115,7 @@ static Value *OptimizeAndOrXor(unsigned Opcode,
 ++NumAnnihil;
 }
 }
- return 0;
+ return nullptr;
 }
 
 /// Helper function of CombineXorOpnd(). It creates a bitwise-and
@@ -1137,7 +1136,7 @@ static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd,
 }
 return Opnd;
 }
- return 0;
+ return nullptr;
 }
 
 // Helper function of OptimizeXor(). It tries to simplify "Opnd1 ^ ConstOpnd"
@@ -1263,7 +1262,7 @@ Value *Reassociate::OptimizeXor(Instruction *I,
 return V;
 
 if (Ops.size() == 1)
- return 0;
+ return nullptr;
 
 SmallVector<XorOpnd, 8> Opnds;
 SmallVector<XorOpnd*, 8> OpndPtrs;
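The xor-combining helpers above lean on bitwise identities; one of them, applicable when two operands share the same symbolic part, is that AND distributes over XOR. A minimal standalone check of that identity (the constants are illustrative, echoing the "x & 789" / "y & 456" examples in the comments):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t c1 = 789, c2 = 456;
  for (uint32_t x : {0u, 1u, 123u, 0xDEADBEEFu, 0xFFFFFFFFu}) {
    // (x & c1) ^ (x & c2) == x & (c1 ^ c2)
    assert(((x & c1) ^ (x & c2)) == (x & (c1 ^ c2)));
  }
  return 0;
}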
@@ -1293,10 +1292,10 @@ Value *Reassociate::OptimizeXor(Instruction *I,
 // the same symbolic value cluster together. For instance, the input operand
 // sequence ("x | 123", "y & 456", "x & 789") will be sorted into:
 // ("x | 123", "x & 789", "y & 456").
- std::sort(OpndPtrs.begin(), OpndPtrs.end(), XorOpnd::PtrSortFunctor());
+ std::stable_sort(OpndPtrs.begin(), OpndPtrs.end(), XorOpnd::PtrSortFunctor());
 
 // Step 3: Combine adjacent operands
- XorOpnd *PrevOpnd = 0;
+ XorOpnd *PrevOpnd = nullptr;
 bool Changed = false;
 for (unsigned i = 0, e = Opnds.size(); i < e; i++) {
 XorOpnd *CurrOpnd = OpndPtrs[i];
@@ -1330,7 +1329,7 @@ Value *Reassociate::OptimizeXor(Instruction *I,
 PrevOpnd = CurrOpnd;
 } else {
 CurrOpnd->Invalidate();
- PrevOpnd = 0;
+ PrevOpnd = nullptr;
 }
 Changed = true;
 }
@@ -1360,7 +1359,7 @@ Value *Reassociate::OptimizeXor(Instruction *I,
 }
 }
 
- return 0;
+ return nullptr;
 }
 
 /// OptimizeAdd - Optimize a series of operands to an 'add' instruction. This
@@ -1369,11 +1368,10 @@ Value *Reassociate::OptimizeXor(Instruction *I,
 Value *Reassociate::OptimizeAdd(Instruction *I,
 SmallVectorImpl<ValueEntry> &Ops) {
 // Scan the operand lists looking for X and -X pairs. If we find any, we
- // can simplify the expression. X+-X == 0. While we're at it, scan for any
+ // can simplify expressions like X+-X == 0 and X+~X == -1. While we're at it,
+ // scan for any
 // duplicates. We want to canonicalize Y+Y+Y+Z -> 3*Y+Z.
- //
- // TODO: We could handle "X + ~X" -> "-1" if we wanted, since "-X = ~X+1".
- //
+
 for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
 Value *TheOp = Ops[i].Op;
 // Check to see if we've seen this operand before. If so, we factor all
@@ -1413,19 +1411,28 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
 continue;
 }
 
- // Check for X and -X in the operand list.
- if (!BinaryOperator::isNeg(TheOp))
+ // Check for X and -X or X and ~X in the operand list.
+ if (!BinaryOperator::isNeg(TheOp) && !BinaryOperator::isNot(TheOp))
 continue;
 
- Value *X = BinaryOperator::getNegArgument(TheOp);
+ Value *X = nullptr;
+ if (BinaryOperator::isNeg(TheOp))
+ X = BinaryOperator::getNegArgument(TheOp);
+ else if (BinaryOperator::isNot(TheOp))
+ X = BinaryOperator::getNotArgument(TheOp);
+
 unsigned FoundX = FindInOperandList(Ops, i, X);
 if (FoundX == i)
 continue;
 
 // Remove X and -X from the operand list.
- if (Ops.size() == 2)
+ if (Ops.size() == 2 && BinaryOperator::isNeg(TheOp))
 return Constant::getNullValue(X->getType());
 
+ // Remove X and ~X from the operand list.
+ if (Ops.size() == 2 && BinaryOperator::isNot(TheOp))
+ return Constant::getAllOnesValue(X->getType());
+
 Ops.erase(Ops.begin()+i);
 if (i < FoundX)
 --FoundX;
@@ -1435,6 +1442,13 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
 ++NumAnnihil;
 --i; // Revisit element.
 e -= 2; // Removed two elements.
+
+ // If we found X and ~X, append -1 to the operand list.
+ if (BinaryOperator::isNot(TheOp)) {
+ Value *V = Constant::getAllOnesValue(X->getType());
+ Ops.insert(Ops.end(), ValueEntry(getRank(V), V));
+ e += 1;
+ }
 }
 
 // Scan the operand list, checking to see if there are any common factors
@@ -1447,7 +1461,7 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
 // Keep track of each multiply we see, to avoid triggering on (X*4)+(X*4)
 // where they are actually the same multiply.
 unsigned MaxOcc = 0;
- Value *MaxOccVal = 0;
+ Value *MaxOccVal = nullptr;
 for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
 BinaryOperator *BOp = isReassociableOp(Ops[i].Op, Instruction::Mul);
 if (!BOp)
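The new X + ~X case rests on the two's complement identity the old TODO already named: since -X == ~X + 1, it follows that X + ~X == -1 (all bits set) for every X. A quick standalone check of the folded identity:

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t x : {0, 1, -1, 42, INT32_MIN, INT32_MAX}) {
    // X + ~X == -1 on two's complement machines, because -X == ~X + 1.
    assert((x + ~x) == -1);
  }
  return 0;
}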
@@ -1545,20 +1559,7 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
 Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2));
 }
 
- return 0;
-}
-
-namespace {
- /// \brief Predicate tests whether a ValueEntry's op is in a map.
- struct IsValueInMap {
- const DenseMap<Value *, unsigned> &Map;
-
- IsValueInMap(const DenseMap<Value *, unsigned> &Map) : Map(Map) {}
-
- bool operator()(const ValueEntry &Entry) {
- return Map.find(Entry.Op) != Map.end();
- }
- };
+ return nullptr;
 }
 
 /// \brief Build up a vector of value/power pairs factoring a product.
@@ -1619,7 +1620,7 @@ bool Reassociate::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
 // below our minimum of '4'.
 assert(FactorPowerSum >= 4);
 
- std::sort(Factors.begin(), Factors.end(), Factor::PowerDescendingSorter());
+ std::stable_sort(Factors.begin(), Factors.end(), Factor::PowerDescendingSorter());
 return true;
 }
 
@@ -1703,14 +1704,14 @@ Value *Reassociate::OptimizeMul(BinaryOperator *I,
 // We can only optimize the multiplies when there is a chain of more than
 // three, such that a balanced tree might require fewer total multiplies.
 if (Ops.size() < 4)
- return 0;
+ return nullptr;
 
 // Try to turn linear trees of multiplies without other uses of the
 // intermediate stages into minimal multiply DAGs with perfect sub-expression
 // re-use.
 SmallVector<Factor, 4> Factors;
 if (!collectMultiplyFactors(Ops, Factors))
- return 0; // All distinct factors, so nothing left for us to do.
+ return nullptr; // All distinct factors, so nothing left for us to do.
 
 IRBuilder<> Builder(I);
 Value *V = buildMinimalMultiplyDAG(Builder, Factors);
@@ -1719,14 +1720,14 @@ Value *Reassociate::OptimizeMul(BinaryOperator *I,
 ValueEntry NewEntry = ValueEntry(getRank(V), V);
 Ops.insert(std::lower_bound(Ops.begin(), Ops.end(), NewEntry), NewEntry);
 
- return 0;
+ return nullptr;
 }
 
 Value *Reassociate::OptimizeExpression(BinaryOperator *I,
 SmallVectorImpl<ValueEntry> &Ops) {
 // Now that we have the linearized expression tree, try to optimize it.
 // Start by folding any constants that we found.
- Constant *Cst = 0;
+ Constant *Cst = nullptr;
 unsigned Opcode = I->getOpcode();
 while (!Ops.empty() && isa<Constant>(Ops.back().Op)) {
 Constant *C = cast<Constant>(Ops.pop_back_val().Op);
@@ -1776,7 +1777,7 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I,
 if (Ops.size() != NumOps)
 return OptimizeExpression(I, Ops);
 
- return 0;
+ return nullptr;
 }
 
 /// EraseInst - Zap the given instruction, adding interesting operands to the
@@ -1795,9 +1796,9 @@ void Reassociate::EraseInst(Instruction *I) {
 // If this is a node in an expression tree, climb to the expression root
 // and add that since that's where optimization actually happens.
 unsigned Opcode = Op->getOpcode();
- while (Op->hasOneUse() && Op->use_back()->getOpcode() == Opcode &&
+ while (Op->hasOneUse() && Op->user_back()->getOpcode() == Opcode &&
 Visited.insert(Op))
- Op = Op->use_back();
+ Op = Op->user_back();
 RedoInsts.insert(Op);
 }
 }
@@ -1815,8 +1816,8 @@ void Reassociate::OptimizeInst(Instruction *I) {
 // is used by a reassociable multiply or add, turn into a multiply.
 if (isReassociableOp(I->getOperand(0), Instruction::Mul) ||
 (I->hasOneUse() &&
- (isReassociableOp(I->use_back(), Instruction::Mul) ||
- isReassociableOp(I->use_back(), Instruction::Add)))) {
+ (isReassociableOp(I->user_back(), Instruction::Mul) ||
+ isReassociableOp(I->user_back(), Instruction::Add)))) {
 Instruction *NI = ConvertShiftToMul(I);
 RedoInsts.insert(I);
 MadeChange = true;
@@ -1869,7 +1870,7 @@ void Reassociate::OptimizeInst(Instruction *I) {
 // and if this is not an inner node of a multiply tree.
if (isReassociableOp(I->getOperand(1), Instruction::Mul) && (!I->hasOneUse() || - !isReassociableOp(I->use_back(), Instruction::Mul))) { + !isReassociableOp(I->user_back(), Instruction::Mul))) { Instruction *NI = LowerNegateToMultiply(I); RedoInsts.insert(I); MadeChange = true; @@ -1885,13 +1886,13 @@ void Reassociate::OptimizeInst(Instruction *I) { // If this is an interior node of a reassociable tree, ignore it until we // get to the root of the tree, to avoid N^2 analysis. unsigned Opcode = BO->getOpcode(); - if (BO->hasOneUse() && BO->use_back()->getOpcode() == Opcode) + if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) return; // If this is an add tree that is used by a sub instruction, ignore it // until we process the subtract. if (BO->hasOneUse() && BO->getOpcode() == Instruction::Add && - cast<Instruction>(BO->use_back())->getOpcode() == Instruction::Sub) + cast<Instruction>(BO->user_back())->getOpcode() == Instruction::Sub) return; ReassociateExpression(BO); @@ -1943,7 +1944,7 @@ void Reassociate::ReassociateExpression(BinaryOperator *I) { // In this case we reassociate to put the negation on the outside so that we // can fold the negation into the add: (-X)*Y + Z -> Z-X*Y if (I->getOpcode() == Instruction::Mul && I->hasOneUse() && - cast<Instruction>(I->use_back())->getOpcode() == Instruction::Add && + cast<Instruction>(I->user_back())->getOpcode() == Instruction::Add && isa<ConstantInt>(Ops.back().Op) && cast<ConstantInt>(Ops.back().Op)->isAllOnesValue()) { ValueEntry Tmp = Ops.pop_back_val(); @@ -1972,6 +1973,9 @@ void Reassociate::ReassociateExpression(BinaryOperator *I) { } bool Reassociate::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; + // Calculate the rank map for F BuildRankMap(F); diff --git a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp index 07f540a..b6023e2 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp @@ -16,20 +16,21 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "reg2mem" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" -#include "llvm/Support/CFG.h" #include "llvm/Transforms/Utils/Local.h" #include <list> using namespace llvm; +#define DEBUG_TYPE "reg2mem" + STATISTIC(NumRegsDemoted, "Number of registers demoted"); STATISTIC(NumPhisDemoted, "Number of phi-nodes demoted"); @@ -40,23 +41,22 @@ namespace { initializeRegToMemPass(*PassRegistry::getPassRegistry()); } - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequiredID(BreakCriticalEdgesID); AU.addPreservedID(BreakCriticalEdgesID); } - bool valueEscapes(const Instruction *Inst) const { - const BasicBlock *BB = Inst->getParent(); - for (Value::const_use_iterator UI = Inst->use_begin(),E = Inst->use_end(); - UI != E; ++UI) { - const Instruction *I = cast<Instruction>(*UI); - if (I->getParent() != BB || isa<PHINode>(I)) + bool valueEscapes(const Instruction *Inst) const { + const BasicBlock *BB = Inst->getParent(); + for (const User *U : Inst->users()) { + const Instruction *UI = cast<Instruction>(U); + if (UI->getParent() != BB || isa<PHINode>(UI)) return true; } return false; } - virtual bool 
runOnFunction(Function &F); + bool runOnFunction(Function &F) override; }; } diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp index 4364720..90c3520 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -17,7 +17,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "sccp" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" @@ -26,13 +25,13 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" -#include "llvm/InstVisitor.h" #include "llvm/Pass.h" -#include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -42,6 +41,8 @@ #include <algorithm> using namespace llvm; +#define DEBUG_TYPE "sccp" + STATISTIC(NumInstRemoved, "Number of instructions removed"); STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable"); @@ -81,7 +82,7 @@ class LatticeVal { } public: - LatticeVal() : Val(0, undefined) {} + LatticeVal() : Val(nullptr, undefined) {} bool isUndefined() const { return getLatticeValue() == undefined; } bool isConstant() const { @@ -133,7 +134,7 @@ public: ConstantInt *getConstantInt() const { if (isConstant()) return dyn_cast<ConstantInt>(getConstant()); - return 0; + return nullptr; } void markForcedConstant(Constant *V) { @@ -153,7 +154,7 @@ namespace { /// Constant Propagation. /// class SCCPSolver : public InstVisitor<SCCPSolver> { - const DataLayout *TD; + const DataLayout *DL; const TargetLibraryInfo *TLI; SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable. DenseMap<Value*, LatticeVal> ValueState; // The state each value is in. @@ -205,8 +206,8 @@ class SCCPSolver : public InstVisitor<SCCPSolver> { typedef std::pair<BasicBlock*, BasicBlock*> Edge; DenseSet<Edge> KnownFeasibleEdges; public: - SCCPSolver(const DataLayout *td, const TargetLibraryInfo *tli) - : TD(td), TLI(tli) {} + SCCPSolver(const DataLayout *DL, const TargetLibraryInfo *tli) + : DL(DL), TLI(tli) {} /// MarkBlockExecutable - This method can be used by clients to mark all of /// the blocks that are known to be intrinsically live in the processed unit. @@ -403,7 +404,7 @@ private: if (Constant *C = dyn_cast<Constant>(V)) { Constant *Elt = C->getAggregateElement(i); - if (Elt == 0) + if (!Elt) LV.markOverdefined(); // Unknown sort of constant. else if (isa<UndefValue>(Elt)) ; // Undef values remain undefined. 
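For readers new to SCCP, the LatticeVal states used above form a three-point lattice: a value starts undefined, may be raised to one known constant, and collapses to overdefined on any conflict, so each value changes state at most twice. The sketch below is a simplified stand-in for the merge rule only; the real class also has a forcedconstant state and stores a Constant* rather than the plain int assumed here.

struct SimpleLatticeVal {
  enum State { Undefined, Constant, Overdefined } St;
  int Val; // stands in for Constant*; valid only when St == Constant
};

// Least upper bound of two lattice values: undefined is the identity,
// agreeing constants stay constant, anything else is overdefined.
static SimpleLatticeVal merge(SimpleLatticeVal A, SimpleLatticeVal B) {
  if (A.St == SimpleLatticeVal::Undefined) return B;
  if (B.St == SimpleLatticeVal::Undefined) return A;
  if (A.St == SimpleLatticeVal::Constant &&
      B.St == SimpleLatticeVal::Constant && A.Val == B.Val)
    return A;
  return SimpleLatticeVal{SimpleLatticeVal::Overdefined, 0};
}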
@@ -491,10 +492,11 @@ private: } void visitCallSite (CallSite CS); void visitResumeInst (TerminatorInst &I) { /*returns void*/ } - void visitUnwindInst (TerminatorInst &I) { /*returns void*/ } void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ } void visitFenceInst (FenceInst &I) { /*returns void*/ } - void visitAtomicCmpXchgInst (AtomicCmpXchgInst &I) { markOverdefined(&I); } + void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) { + markAnythingOverdefined(&I); + } void visitAtomicRMWInst (AtomicRMWInst &I) { markOverdefined(&I); } void visitAllocaInst (Instruction &I) { markOverdefined(&I); } void visitVAArgInst (Instruction &I) { markAnythingOverdefined(&I); } @@ -523,7 +525,7 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, LatticeVal BCValue = getValueState(BI->getCondition()); ConstantInt *CI = BCValue.getConstantInt(); - if (CI == 0) { + if (!CI) { // Overdefined condition variables, and branches on unfoldable constant // conditions, mean the branch could go either way. if (!BCValue.isUndefined()) @@ -550,7 +552,7 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, LatticeVal SCValue = getValueState(SI->getCondition()); ConstantInt *CI = SCValue.getConstantInt(); - if (CI == 0) { // Overdefined or undefined condition? + if (!CI) { // Overdefined or undefined condition? // All destinations are executable! if (!SCValue.isUndefined()) Succs.assign(TI.getNumSuccessors(), true); @@ -595,7 +597,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { // Overdefined condition variables mean the branch could go either way, // undef conditions mean that neither edge is feasible yet. ConstantInt *CI = BCValue.getConstantInt(); - if (CI == 0) + if (!CI) return !BCValue.isUndefined(); // Constant condition variables mean the branch can only go a single way. @@ -613,7 +615,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { LatticeVal SCValue = getValueState(SI->getCondition()); ConstantInt *CI = SCValue.getConstantInt(); - if (CI == 0) + if (!CI) return !SCValue.isUndefined(); return SI->findCaseValue(CI).getCaseSuccessor() == To; @@ -627,7 +629,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { #ifndef NDEBUG dbgs() << "Unknown terminator instruction: " << *TI << '\n'; #endif - llvm_unreachable(0); + llvm_unreachable(nullptr); } // visit Implementations - Something changed in this instruction, either an @@ -668,7 +670,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) { // constant. If they are constant and don't agree, the PHI is overdefined. // If there are no executable operands, the PHI remains undefined. // - Constant *OperandVal = 0; + Constant *OperandVal = nullptr; for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { LatticeVal IV = getValueState(PN.getIncomingValue(i)); if (IV.isUndefined()) continue; // Doesn't influence PHI node. @@ -679,7 +681,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) { if (IV.isOverdefined()) // PHI node becomes overdefined! return markOverdefined(&PN); - if (OperandVal == 0) { // Grab the first value. + if (!OperandVal) { // Grab the first value. 
OperandVal = IV.getConstant(); continue; } @@ -775,7 +777,7 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) { void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) { StructType *STy = dyn_cast<StructType>(IVI.getType()); - if (STy == 0) + if (!STy) return markOverdefined(&IVI); // If this has more than one index, we can't handle it, drive all results to @@ -863,7 +865,7 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { // If this is an AND or OR with 0 or -1, it doesn't matter that the other // operand is overdefined. if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Or) { - LatticeVal *NonOverdefVal = 0; + LatticeVal *NonOverdefVal = nullptr; if (!V1State.isOverdefined()) NonOverdefVal = &V1State; else if (!V2State.isOverdefined()) @@ -1067,7 +1069,7 @@ void SCCPSolver::visitLoadInst(LoadInst &I) { } // Transform load from a constant into a constant if possible. - if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, TD)) + if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, DL)) return markConstant(IV, &I, C); // Otherwise we cannot say for certain what value this load will produce. @@ -1082,7 +1084,7 @@ void SCCPSolver::visitCallSite(CallSite CS) { // The common case is that we aren't tracking the callee, either because we // are not doing interprocedural analysis or the callee is indirect, or is // external. Handle these cases first. - if (F == 0 || F->isDeclaration()) { + if (!F || F->isDeclaration()) { CallOverdefined: // Void return and not tracking callee, just bail. if (I->getType()->isVoidTy()) return; @@ -1181,10 +1183,9 @@ void SCCPSolver::Solve() { // since all of its users will have already been marked as overdefined // Update all of the users of this instruction's value. // - for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); - UI != E; ++UI) - if (Instruction *I = dyn_cast<Instruction>(*UI)) - OperandChangedState(I); + for (User *U : I->users()) + if (Instruction *UI = dyn_cast<Instruction>(U)) + OperandChangedState(UI); } // Process the instruction work list. @@ -1201,10 +1202,9 @@ void SCCPSolver::Solve() { // Update all of the users of this instruction's value. // if (I->getType()->isStructTy() || !getValueState(I).isOverdefined()) - for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); - UI != E; ++UI) - if (Instruction *I = dyn_cast<Instruction>(*UI)) - OperandChangedState(I); + for (User *U : I->users()) + if (Instruction *UI = dyn_cast<Instruction>(U)) + OperandChangedState(UI); } // Process the basic block work list. @@ -1499,7 +1499,7 @@ namespace { /// Sparse Conditional Constant Propagator. /// struct SCCP : public FunctionPass { - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetLibraryInfo>(); } static char ID; // Pass identification, replacement for typeid @@ -1510,7 +1510,7 @@ namespace { // runOnFunction - Run the Sparse Conditional Constant Propagation // algorithm, and return true if the function was modified. // - bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; }; } // end anonymous namespace @@ -1553,10 +1553,14 @@ static void DeleteInstructionInBlock(BasicBlock *BB) { // and return true if the function was modified. 
// bool SCCP::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; + DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n"); - const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); + const DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); - SCCPSolver Solver(TD, TLI); + SCCPSolver Solver(DL, TLI); // Mark the first block of the function as being executable. Solver.MarkBlockExecutable(F.begin()); @@ -1628,14 +1632,14 @@ namespace { /// Constant Propagation. /// struct IPSCCP : public ModulePass { - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetLibraryInfo>(); } static char ID; IPSCCP() : ModulePass(ID) { initializeIPSCCPPass(*PassRegistry::getPassRegistry()); } - bool runOnModule(Module &M); + bool runOnModule(Module &M) override; }; } // end anonymous namespace @@ -1658,21 +1662,20 @@ static bool AddressIsTaken(const GlobalValue *GV) { // Delete any dead constantexpr klingons. GV->removeDeadConstantUsers(); - for (Value::const_use_iterator UI = GV->use_begin(), E = GV->use_end(); - UI != E; ++UI) { - const User *U = *UI; - if (const StoreInst *SI = dyn_cast<StoreInst>(U)) { + for (const Use &U : GV->uses()) { + const User *UR = U.getUser(); + if (const StoreInst *SI = dyn_cast<StoreInst>(UR)) { if (SI->getOperand(0) == GV || SI->isVolatile()) return true; // Storing addr of GV. - } else if (isa<InvokeInst>(U) || isa<CallInst>(U)) { + } else if (isa<InvokeInst>(UR) || isa<CallInst>(UR)) { // Make sure we are calling the function, not passing the address. - ImmutableCallSite CS(cast<Instruction>(U)); - if (!CS.isCallee(UI)) + ImmutableCallSite CS(cast<Instruction>(UR)); + if (!CS.isCallee(&U)) return true; - } else if (const LoadInst *LI = dyn_cast<LoadInst>(U)) { + } else if (const LoadInst *LI = dyn_cast<LoadInst>(UR)) { if (LI->isVolatile()) return true; - } else if (isa<BlockAddress>(U)) { + } else if (isa<BlockAddress>(UR)) { // blockaddress doesn't take the address of the function, it takes addr // of label. } else { @@ -1683,9 +1686,10 @@ static bool AddressIsTaken(const GlobalValue *GV) { } bool IPSCCP::runOnModule(Module &M) { - const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); - SCCPSolver Solver(TD, TLI); + SCCPSolver Solver(DL, TLI); // AddressTakenFunctions - This set keeps track of the address-taken functions // that are in the input. As IPSCCP runs through and simplifies code, @@ -1834,8 +1838,9 @@ bool IPSCCP::runOnModule(Module &M) { for (unsigned i = 0, e = BlocksToErase.size(); i != e; ++i) { // If there are any PHI nodes in this successor, drop entries for BB now. BasicBlock *DeadBB = BlocksToErase[i]; - for (Value::use_iterator UI = DeadBB->use_begin(), UE = DeadBB->use_end(); - UI != UE; ) { + for (Value::user_iterator UI = DeadBB->user_begin(), + UE = DeadBB->user_end(); + UI != UE;) { // Grab the user and then increment the iterator early, as the user // will be deleted. Step past all adjacent uses from the same user. 
Instruction *I = dyn_cast<Instruction>(*UI); @@ -1925,7 +1930,7 @@ bool IPSCCP::runOnModule(Module &M) { "Overdefined values should have been taken out of the map!"); DEBUG(dbgs() << "Found that GV '" << GV->getName() << "' is constant!\n"); while (!GV->use_empty()) { - StoreInst *SI = cast<StoreInst>(GV->use_back()); + StoreInst *SI = cast<StoreInst>(GV->user_back()); SI->eraseFromParent(); } M.getGlobalList().erase(GV); diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp index 9f3fc83..8c7f253 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp @@ -23,40 +23,48 @@ /// //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "sroa" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/PtrUseVisitor.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/DIBuilder.h" -#include "llvm/DebugInfo.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DIBuilder.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Operator.h" -#include "llvm/InstVisitor.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/TimeValue.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include "llvm/Transforms/Utils/SSAUpdater.h" + +#if __cplusplus >= 201103L && !defined(NDEBUG) +// We only use this for a debug check in C++11 +#include <random> +#endif + using namespace llvm; +#define DEBUG_TYPE "sroa" + STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement"); STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed"); STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca"); @@ -73,6 +81,16 @@ STATISTIC(NumVectorized, "Number of vectorized aggregates"); static cl::opt<bool> ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden); +/// Hidden option to enable randomly shuffling the slices to help uncover +/// instability in their order. +static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices", + cl::init(false), cl::Hidden); + +/// Hidden option to experiment with completely strict handling of inbounds +/// GEPs. +static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", + cl::init(false), cl::Hidden); + namespace { /// \brief A custom IRBuilder inserter which prefixes all names if they are /// preserved. @@ -142,8 +160,8 @@ public: Use *getUse() const { return UseAndIsSplittable.getPointer(); } - bool isDead() const { return getUse() == 0; } - void kill() { UseAndIsSplittable.setPointer(0); } + bool isDead() const { return getUse() == nullptr; } + void kill() { UseAndIsSplittable.setPointer(nullptr); } /// \brief Support for ordering ranges. 
 ///
@@ -244,8 +262,8 @@ public:
 void printUse(raw_ostream &OS, const_iterator I,
 StringRef Indent = "  ") const;
 void print(raw_ostream &OS) const;
- void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump(const_iterator I) const;
- void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump() const;
+ void dump(const_iterator I) const;
+ void dump() const;
 #endif
 
 private:
@@ -303,7 +321,7 @@ static Value *foldSelectInst(SelectInst &SI) {
 if (SI.getOperand(1) == SI.getOperand(2))
 return SI.getOperand(1);
 
- return 0;
+ return nullptr;
 }
 
 /// \brief Builder for the alloca slices.
@@ -339,7 +357,7 @@ private:
 bool IsSplittable = false) {
 // Completely skip uses which have a zero size or start either before or
 // past the end of the allocation.
- if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize)) {
+ if (Size == 0 || Offset.uge(AllocSize)) {
 DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
 << " which has zero size or starts outside of the "
 << AllocSize << " byte alloca:\n"
@@ -380,6 +398,43 @@ private:
 if (GEPI.use_empty())
 return markAsDead(GEPI);
 
+ if (SROAStrictInbounds && GEPI.isInBounds()) {
+ // FIXME: This is a manually un-factored variant of the basic code inside
+ // of GEPs with checking of the inbounds invariant specified in the
+ // langref in a very strict sense. If we ever want to enable
+ // SROAStrictInbounds, this code should be factored cleanly into
+ // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds
+ // by writing out the code here where we have the underlying allocation
+ // size readily available.
+ APInt GEPOffset = Offset;
+ for (gep_type_iterator GTI = gep_type_begin(GEPI),
+ GTE = gep_type_end(GEPI);
+ GTI != GTE; ++GTI) {
+ ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
+ if (!OpC)
+ break;
+
+ // Handle a struct index, which adds its field offset to the pointer.
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+ unsigned ElementIdx = OpC->getZExtValue();
+ const StructLayout *SL = DL.getStructLayout(STy);
+ GEPOffset +=
+ APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
+ } else {
+ // For array or vector indices, scale the index by the size of the type.
+ APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
+ GEPOffset += Index * APInt(Offset.getBitWidth(),
+ DL.getTypeAllocSize(GTI.getIndexedType()));
+ }
+
+ // If this index has computed an intermediate pointer which is not
+ // inbounds, then the result of the GEP is a poison value and we can
+ // delete it and all uses.
+ if (GEPOffset.ugt(AllocSize))
+ return markAsDead(GEPI);
+ }
+ }
+
 return Base::visitGetElementPtrInst(GEPI);
 }
 
@@ -426,8 +481,7 @@ private:
 // risk of overflow.
 // FIXME: We should instead consider the pointer to have escaped if this
 // function is being instrumented for addressing bugs or race conditions.
- if (Offset.isNegative() || Size > AllocSize ||
- Offset.ugt(AllocSize - Size)) {
+ if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
 DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset
 << " which extends past the end of the " << AllocSize
 << " byte alloca:\n"
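A worked instance of the offset accumulation in the strict-inbounds check above, using host C++ types as stand-ins for the IR aggregates (the struct and constants are illustrative): a struct index contributes the field's layout offset, and an array index contributes index times the element's allocation size; the running total must never exceed the allocation.

#include <cassert>
#include <cstddef>
#include <cstdint>

struct Agg {
  int32_t A;    // element 0
  int64_t B[4]; // element 1
};

int main() {
  // Equivalent of walking "gep %Agg, 0, 1, 2":
  uint64_t GEPOffset = 0;
  GEPOffset += offsetof(Agg, B);    // struct index 1: field offset
  GEPOffset += 2 * sizeof(int64_t); // array index 2: 2 * element size
  assert(GEPOffset == offsetof(Agg, B) + 16);
  assert(GEPOffset <= sizeof(Agg)); // intermediate pointer stays inbounds
  return 0;
}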
 return markAsDead(II);
 
@@ -461,14 +515,30 @@ private:
 
 void visitMemTransferInst(MemTransferInst &II) {
 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
- if ((Length && Length->getValue() == 0) ||
- (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
+ if (Length && Length->getValue() == 0)
 // Zero-length mem transfer intrinsics can be ignored entirely.
 return markAsDead(II);
 
+ // Because we can visit these intrinsics twice, also check to see if the
+ // first time marked this instruction as dead. If so, skip it.
+ if (VisitedDeadInsts.count(&II))
+ return;
+
 if (!IsOffsetKnown)
 return PI.setAborted(&II);
 
+ // This side of the transfer is completely out-of-bounds, and so we can
+ // nuke the entire transfer. However, we also need to nuke the other side
+ // if already added to our partitions.
+ // FIXME: Yet another place we really should bypass this when
+ // instrumenting for ASan.
+ if (Offset.uge(AllocSize)) {
+ SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
+ MemTransferSliceMap.find(&II);
+ if (MTPI != MemTransferSliceMap.end())
+ S.Slices[MTPI->second].kill();
+ return markAsDead(II);
+ }
+
 uint64_t RawOffset = Offset.getLimitedValue();
 uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
 
@@ -487,7 +557,7 @@ private:
 // they both point to the same alloca.
 bool Inserted;
 SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
- llvm::tie(MTPI, Inserted) =
+ std::tie(MTPI, Inserted) =
 MemTransferSliceMap.insert(std::make_pair(&II, S.Slices.size()));
 unsigned PrevIdx = MTPI->second;
 if (!Inserted) {
@@ -546,7 +616,7 @@ private:
 Size = 0;
 do {
 Instruction *I, *UsedI;
- llvm::tie(UsedI, I) = Uses.pop_back_val();
+ std::tie(UsedI, I) = Uses.pop_back_val();
 
 if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
 Size = std::max(Size, DL.getTypeStoreSize(LI->getType()));
@@ -568,13 +638,12 @@ private:
 return I;
 }
 
- for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE;
- ++UI)
- if (Visited.insert(cast<Instruction>(*UI)))
- Uses.push_back(std::make_pair(I, cast<Instruction>(*UI)));
+ for (User *U : I->users())
+ if (Visited.insert(cast<Instruction>(U)))
+ Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
 } while (!Uses.empty());
 
- return 0;
+ return nullptr;
 }
 
 void visitPHINode(PHINode &PN) {
@@ -597,8 +666,7 @@ private:
 // themselves which should be replaced with undef.
 // FIXME: This should instead be escaped in the event we're instrumenting
 // for address sanitization.
- if ((Offset.isNegative() && (-Offset).uge(PHISize)) ||
- (!Offset.isNegative() && Offset.uge(AllocSize))) {
+ if (Offset.uge(AllocSize)) {
 S.DeadOperands.push_back(U);
 return;
 }
@@ -638,8 +706,7 @@ private:
 // themselves which should be replaced with undef.
 // FIXME: This should instead be escaped in the event we're instrumenting
 // for address sanitization.
- if ((Offset.isNegative() && Offset.uge(SelectSize)) || - (!Offset.isNegative() && Offset.uge(AllocSize))) { + if (Offset.uge(AllocSize)) { S.DeadOperands.push_back(U); return; } @@ -658,7 +725,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) AI(AI), #endif - PointerEscapingInstr(0) { + PointerEscapingInstr(nullptr) { SliceBuilder PB(DL, AI, *this); SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI); if (PtrI.isEscaped() || PtrI.isAborted()) { @@ -674,6 +741,13 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) std::mem_fun_ref(&Slice::isDead)), Slices.end()); +#if __cplusplus >= 201103L && !defined(NDEBUG) + if (SROARandomShuffleSlices) { + std::mt19937 MT(static_cast<unsigned>(sys::TimeValue::now().msec())); + std::shuffle(Slices.begin(), Slices.end(), MT); + } +#endif + // Sort the uses. This arranges for the offsets to be in ascending order, // and the sizes to be in descending order. std::sort(Slices.begin(), Slices.end()); @@ -712,8 +786,10 @@ void AllocaSlices::print(raw_ostream &OS) const { print(OS, I); } -void AllocaSlices::dump(const_iterator I) const { print(dbgs(), I); } -void AllocaSlices::dump() const { print(dbgs()); } +LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const { + print(dbgs(), I); +} +LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); } #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -741,12 +817,10 @@ public: // Retain the debug information attached to the alloca for use when // rewriting loads and stores. if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) { - for (Value::use_iterator UI = DebugNode->use_begin(), - UE = DebugNode->use_end(); - UI != UE; ++UI) - if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI)) + for (User *U : DebugNode->users()) + if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) DDIs.push_back(DDI); - else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(*UI)) + else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U)) DVIs.push_back(DVI); } @@ -760,8 +834,8 @@ public: DVIs.pop_back_val()->eraseFromParent(); } - virtual bool isInstInList(Instruction *I, - const SmallVectorImpl<Instruction*> &Insts) const { + bool isInstInList(Instruction *I, + const SmallVectorImpl<Instruction*> &Insts) const override { Value *Ptr; if (LoadInst *LI = dyn_cast<LoadInst>(I)) Ptr = LI->getOperand(0); @@ -788,7 +862,7 @@ public: return false; } - virtual void updateDebugInfo(Instruction *Inst) const { + void updateDebugInfo(Instruction *Inst) const override { for (SmallVectorImpl<DbgDeclareInst *>::const_iterator I = DDIs.begin(), E = DDIs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; @@ -800,7 +874,7 @@ public: for (SmallVectorImpl<DbgValueInst *>::const_iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; - Value *Arg = 0; + Value *Arg = nullptr; if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { // If an argument is zero extended then use argument directly. The ZExt // may be zapped by an optimization pass in future. 
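The SROARandomShuffleSlices hook added above is a determinism check: if the eventual ordering of slices is fully determined by the sort, then randomly permuting them first must not change the result. The same idea in a standalone form (a fixed seed is used here for reproducibility, whereas the pass seeds from the clock precisely to surface hidden order dependence):

#include <algorithm>
#include <cassert>
#include <random>
#include <vector>

int main() {
  std::vector<int> Slices = {5, 3, 9, 1, 3};
  std::vector<int> Reference(Slices);
  std::sort(Reference.begin(), Reference.end());

  std::mt19937 MT(12345);
  for (int Trial = 0; Trial != 100; ++Trial) {
    std::shuffle(Slices.begin(), Slices.end(), MT);
    std::sort(Slices.begin(), Slices.end());
    assert(Slices == Reference); // sorted order is input-order independent
  }
  return 0;
}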
@@ -896,13 +970,13 @@ class SROA : public FunctionPass { public: SROA(bool RequiresDomTree = true) : FunctionPass(ID), RequiresDomTree(RequiresDomTree), - C(0), DL(0), DT(0) { + C(nullptr), DL(nullptr), DT(nullptr) { initializeSROAPass(*PassRegistry::getPassRegistry()); } - bool runOnFunction(Function &F); - void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; - const char *getPassName() const { return "SROA"; } + const char *getPassName() const override { return "SROA"; } static char ID; private: @@ -915,6 +989,7 @@ private: ArrayRef<AllocaSlices::iterator> SplitUses); bool splitAlloca(AllocaInst &AI, AllocaSlices &S); bool runOnAlloca(AllocaInst &AI); + void clobberUse(Use &U); void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas); bool promoteAllocas(Function &F); }; @@ -928,7 +1003,7 @@ FunctionPass *llvm::createSROAPass(bool RequiresDomTree) { INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", false, false) @@ -937,8 +1012,12 @@ INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", static Type *findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E, uint64_t EndOffset) { - Type *Ty = 0; - bool IgnoreNonIntegralTypes = false; + Type *Ty = nullptr; + bool TyIsCommon = true; + IntegerType *ITy = nullptr; + + // Note that we need to look at *every* alloca slice's Use to ensure we + // always get consistent results regardless of the order of slices. for (AllocaSlices::const_iterator I = B; I != E; ++I) { Use *U = I->getUse(); if (isa<IntrinsicInst>(*U->getUser())) @@ -946,42 +1025,37 @@ static Type *findCommonType(AllocaSlices::const_iterator B, if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset) continue; - Type *UserTy = 0; + Type *UserTy = nullptr; if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { UserTy = LI->getType(); } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { UserTy = SI->getValueOperand()->getType(); - } else { - IgnoreNonIntegralTypes = true; // Give up on anything but an iN type. - continue; } - if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) { + if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) { // If the type is larger than the partition, skip it. We only encounter // this for split integer operations where we want to use the type of the // entity causing the split. Also skip if the type is not a byte width // multiple. - if (ITy->getBitWidth() % 8 != 0 || - ITy->getBitWidth() / 8 > (EndOffset - B->beginOffset())) + if (UserITy->getBitWidth() % 8 != 0 || + UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset())) continue; - // If we have found an integer type use covering the alloca, use that - // regardless of the other types, as integers are often used for - // a "bucket of bits" type. - // - // NB: This *must* be the only return from inside the loop so that the - // order of slices doesn't impact the computed type. - return ITy; - } else if (IgnoreNonIntegralTypes) { - continue; + // Track the largest bitwidth integer type used in this way in case there + // is no common type. + if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth()) + ITy = UserITy; } - if (Ty && Ty != UserTy) - IgnoreNonIntegralTypes = true; // Give up on anything but an iN type. 
- - Ty = UserTy; + // To avoid depending on the order of slices, Ty and TyIsCommon must not + // depend on types skipped above. + if (!UserTy || (Ty && Ty != UserTy)) + TyIsCommon = false; // Give up on anything but an iN type. + else + Ty = UserTy; } - return Ty; + + return TyIsCommon ? Ty : ITy; } /// PHI instructions that use an alloca and are subsequently loaded can be @@ -1003,7 +1077,7 @@ static Type *findCommonType(AllocaSlices::const_iterator B, /// FIXME: This should be hoisted into a generic utility, likely in /// Transforms/Util/Local.h static bool isSafePHIToSpeculate(PHINode &PN, - const DataLayout *DL = 0) { + const DataLayout *DL = nullptr) { // For now, we can only do this promotion if the load is in the same block // as the PHI, and if there are no stores between the phi and load. // TODO: Allow recursive phi users. @@ -1011,10 +1085,9 @@ static bool isSafePHIToSpeculate(PHINode &PN, BasicBlock *BB = PN.getParent(); unsigned MaxAlign = 0; bool HaveLoad = false; - for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end(); UI != UE; - ++UI) { - LoadInst *LI = dyn_cast<LoadInst>(*UI); - if (LI == 0 || !LI->isSimple()) + for (User *U : PN.users()) { + LoadInst *LI = dyn_cast<LoadInst>(U); + if (!LI || !LI->isSimple()) return false; // For now we only allow loads in the same block as the PHI. This is @@ -1057,7 +1130,7 @@ static bool isSafePHIToSpeculate(PHINode &PN, // If this pointer is always safe to load, or if we can prove that there // is already a load in the block, then we can move the load to the pred // block. - if (InVal->isDereferenceablePointer() || + if (InVal->isDereferenceablePointer(DL) || isSafeToLoadUnconditionally(InVal, TI, MaxAlign, DL)) continue; @@ -1077,13 +1150,13 @@ static void speculatePHINodeLoads(PHINode &PN) { // Get the TBAA tag and alignment to use from one of the loads. It doesn't // matter which one we get and if any differ. - LoadInst *SomeLoad = cast<LoadInst>(*PN.use_begin()); + LoadInst *SomeLoad = cast<LoadInst>(PN.user_back()); MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); unsigned Align = SomeLoad->getAlignment(); // Rewrite all loads of the PN to use the new PHI. while (!PN.use_empty()) { - LoadInst *LI = cast<LoadInst>(*PN.use_begin()); + LoadInst *LI = cast<LoadInst>(PN.user_back()); LI->replaceAllUsesWith(NewPN); LI->eraseFromParent(); } @@ -1121,16 +1194,16 @@ static void speculatePHINodeLoads(PHINode &PN) { /// /// We can do this to a select if its only uses are loads and if the operand /// to the select can be loaded unconditionally. -static bool isSafeSelectToSpeculate(SelectInst &SI, const DataLayout *DL = 0) { +static bool isSafeSelectToSpeculate(SelectInst &SI, + const DataLayout *DL = nullptr) { Value *TValue = SI.getTrueValue(); Value *FValue = SI.getFalseValue(); - bool TDerefable = TValue->isDereferenceablePointer(); - bool FDerefable = FValue->isDereferenceablePointer(); + bool TDerefable = TValue->isDereferenceablePointer(DL); + bool FDerefable = FValue->isDereferenceablePointer(DL); - for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end(); UI != UE; - ++UI) { - LoadInst *LI = dyn_cast<LoadInst>(*UI); - if (LI == 0 || !LI->isSimple()) + for (User *U : SI.users()) { + LoadInst *LI = dyn_cast<LoadInst>(U); + if (!LI || !LI->isSimple()) return false; // Both operands to the select need to be dereferencable, either @@ -1155,7 +1228,7 @@ static void speculateSelectInstLoads(SelectInst &SI) { Value *FV = SI.getFalseValue(); // Replace the loads of the select with a select of two loads. 
while (!SI.use_empty()) { - LoadInst *LI = cast<LoadInst>(*SI.use_begin()); + LoadInst *LI = cast<LoadInst>(SI.user_back()); assert(LI->isSimple() && "We only speculate simple loads"); IRB.SetInsertPoint(LI); @@ -1188,7 +1261,7 @@ static void speculateSelectInstLoads(SelectInst &SI) { /// This will return the BasePtr if that is valid, or build a new GEP /// instruction using the IRBuilder if GEP-ing is needed. static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr, - SmallVectorImpl<Value *> &Indices) { + SmallVectorImpl<Value *> &Indices, Twine NamePrefix) { if (Indices.empty()) return BasePtr; @@ -1197,7 +1270,7 @@ static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr, if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero()) return BasePtr; - return IRB.CreateInBoundsGEP(BasePtr, Indices, "idx"); + return IRB.CreateInBoundsGEP(BasePtr, Indices, NamePrefix + "sroa_idx"); } /// \brief Get a natural GEP off of the BasePtr walking through Ty toward @@ -1211,9 +1284,13 @@ static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr, /// indicated by Indices to have the correct offset. static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL, Value *BasePtr, Type *Ty, Type *TargetTy, - SmallVectorImpl<Value *> &Indices) { + SmallVectorImpl<Value *> &Indices, + Twine NamePrefix) { if (Ty == TargetTy) - return buildGEP(IRB, BasePtr, Indices); + return buildGEP(IRB, BasePtr, Indices, NamePrefix); + + // Pointer size to use for the indices. + unsigned PtrSize = DL.getPointerTypeSizeInBits(BasePtr->getType()); // See if we can descend into a struct and locate a field with the correct // type. @@ -1222,11 +1299,13 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL, do { if (ElementTy->isPointerTy()) break; - if (SequentialType *SeqTy = dyn_cast<SequentialType>(ElementTy)) { - ElementTy = SeqTy->getElementType(); - // Note that we use the default address space as this index is over an - // array or a vector, not a pointer. - Indices.push_back(IRB.getInt(APInt(DL.getPointerSizeInBits(0), 0))); + + if (ArrayType *ArrayTy = dyn_cast<ArrayType>(ElementTy)) { + ElementTy = ArrayTy->getElementType(); + Indices.push_back(IRB.getIntN(PtrSize, 0)); + } else if (VectorType *VectorTy = dyn_cast<VectorType>(ElementTy)) { + ElementTy = VectorTy->getElementType(); + Indices.push_back(IRB.getInt32(0)); } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) { if (STy->element_begin() == STy->element_end()) break; // Nothing left to descend into. @@ -1240,7 +1319,7 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL, if (ElementTy != TargetTy) Indices.erase(Indices.end() - NumLayers, Indices.end()); - return buildGEP(IRB, BasePtr, Indices); + return buildGEP(IRB, BasePtr, Indices, NamePrefix); } /// \brief Recursively compute indices for a natural GEP. @@ -1250,29 +1329,32 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL, static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, Type *Ty, APInt &Offset, Type *TargetTy, - SmallVectorImpl<Value *> &Indices) { + SmallVectorImpl<Value *> &Indices, + Twine NamePrefix) { if (Offset == 0) - return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices); + return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices, NamePrefix); // We can't recurse through pointer types. 
if (Ty->isPointerTy()) - return 0; + return nullptr; // We try to analyze GEPs over vectors here, but note that these GEPs are // extremely poorly defined currently. The long-term goal is to remove GEPing // over a vector from the IR completely. if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) { unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType()); - if (ElementSizeInBits % 8) - return 0; // GEPs over non-multiple of 8 size vector elements are invalid. + if (ElementSizeInBits % 8 != 0) { + // GEPs over non-multiple of 8 size vector elements are invalid. + return nullptr; + } APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8); APInt NumSkippedElements = Offset.sdiv(ElementSize); if (NumSkippedElements.ugt(VecTy->getNumElements())) - return 0; + return nullptr; Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(), - Offset, TargetTy, Indices); + Offset, TargetTy, Indices, NamePrefix); } if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { @@ -1280,31 +1362,31 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy)); APInt NumSkippedElements = Offset.sdiv(ElementSize); if (NumSkippedElements.ugt(ArrTy->getNumElements())) - return 0; + return nullptr; Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, - Indices); + Indices, NamePrefix); } StructType *STy = dyn_cast<StructType>(Ty); if (!STy) - return 0; + return nullptr; const StructLayout *SL = DL.getStructLayout(STy); uint64_t StructOffset = Offset.getZExtValue(); if (StructOffset >= SL->getSizeInBytes()) - return 0; + return nullptr; unsigned Index = SL->getElementContainingOffset(StructOffset); Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index)); Type *ElementTy = STy->getElementType(Index); if (Offset.uge(DL.getTypeAllocSize(ElementTy))) - return 0; // The offset points into alignment padding. + return nullptr; // The offset points into alignment padding. Indices.push_back(IRB.getInt32(Index)); return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, - Indices); + Indices, NamePrefix); } /// \brief Get a natural GEP from a base pointer to a particular offset and @@ -1319,26 +1401,27 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, /// If no natural GEP can be constructed, this function returns null. static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, APInt Offset, Type *TargetTy, - SmallVectorImpl<Value *> &Indices) { + SmallVectorImpl<Value *> &Indices, + Twine NamePrefix) { PointerType *Ty = cast<PointerType>(Ptr->getType()); // Don't consider any GEPs through an i8* as natural unless the TargetTy is // an i8. - if (Ty == IRB.getInt8PtrTy() && TargetTy->isIntegerTy(8)) - return 0; + if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8)) + return nullptr; Type *ElementTy = Ty->getElementType(); if (!ElementTy->isSized()) - return 0; // We can't GEP through an unsized element. + return nullptr; // We can't GEP through an unsized element. APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy)); if (ElementSize == 0) - return 0; // Zero-length arrays can't help us build a natural GEP. 
+ return nullptr; // Zero-length arrays can't help us build a natural GEP. APInt NumSkippedElements = Offset.sdiv(ElementSize); Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, - Indices); + Indices, NamePrefix); } /// \brief Compute an adjusted pointer from Ptr by Offset bytes where the @@ -1356,8 +1439,9 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL, /// properties. The algorithm tries to fold as many constant indices into /// a single GEP as possible, thus making each GEP more independent of the /// surrounding code. -static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, - Value *Ptr, APInt Offset, Type *PointerTy) { +static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, + APInt Offset, Type *PointerTy, + Twine NamePrefix) { // Even though we don't look through PHI nodes, we could be called on an // instruction in an unreachable block, which may be on a cycle. SmallPtrSet<Value *, 4> Visited; @@ -1367,11 +1451,11 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, // We may end up computing an offset pointer that has the wrong type. If we // never are able to compute one directly that has the correct type, we'll // fall back to it, so keep it around here. - Value *OffsetPtr = 0; + Value *OffsetPtr = nullptr; // Remember any i8 pointer we come across to re-use if we need to do a raw // byte offset. - Value *Int8Ptr = 0; + Value *Int8Ptr = nullptr; APInt Int8PtrOffset(Offset.getBitWidth(), 0); Type *TargetTy = PointerTy->getPointerElementType(); @@ -1391,7 +1475,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, // See if we can perform a natural GEP here. Indices.clear(); if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy, - Indices)) { + Indices, NamePrefix)) { if (P->getType() == PointerTy) { // Zap any offset pointer that we ended up computing in previous rounds. if (OffsetPtr && OffsetPtr->use_empty()) @@ -1425,20 +1509,21 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, if (!OffsetPtr) { if (!Int8Ptr) { - Int8Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(), - "raw_cast"); + Int8Ptr = IRB.CreateBitCast( + Ptr, IRB.getInt8PtrTy(PointerTy->getPointerAddressSpace()), + NamePrefix + "sroa_raw_cast"); Int8PtrOffset = Offset; } OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr : IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset), - "raw_idx"); + NamePrefix + "sroa_raw_idx"); } Ptr = OffsetPtr; // On the off chance we were targeting i8*, guard the bitcast here. if (Ptr->getType() != PointerTy) - Ptr = IRB.CreateBitCast(Ptr, PointerTy, "cast"); + Ptr = IRB.CreateBitCast(Ptr, PointerTy, NamePrefix + "sroa_cast"); return Ptr; } @@ -1931,16 +2016,22 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> { // integer type will be stored here for easy access during rewriting. IntegerType *IntTy; - // The offset of the slice currently being rewritten. + // The original offset of the slice currently being rewritten relative to + // the original alloca. uint64_t BeginOffset, EndOffset; + // The new offsets of the slice currently being rewritten relative to the + // original alloca. 
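
// For the struct case of getNaturalGEPRecursively above,
// StructLayout::getElementContainingOffset finds the field whose byte range
// covers the offset. A standalone sketch of that lookup, assuming a
// precomputed (hypothetical) table of field offsets; the real code then
// subtracts the field's offset and rejects offsets landing in padding.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Return the index of the field whose [offset, next-offset) range contains
// Offset. upper_bound finds the first field that starts *after* Offset.
unsigned elementContainingOffset(const std::vector<uint64_t> &FieldOffsets,
                                 uint64_t Offset) {
  auto It = std::upper_bound(FieldOffsets.begin(), FieldOffsets.end(), Offset);
  return unsigned(It - FieldOffsets.begin()) - 1;
}

int main() {
  // struct { i32 a; i32 b; double c; } with field offsets {0, 4, 8}.
  std::vector<uint64_t> Offsets = {0, 4, 8};
  std::cout << elementContainingOffset(Offsets, 5) << '\n';  // 1 (field b)
  std::cout << elementContainingOffset(Offsets, 12) << '\n'; // 2 (field c)
}
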
+ uint64_t NewBeginOffset, NewEndOffset; + + uint64_t SliceSize; bool IsSplittable; bool IsSplit; Use *OldUse; Instruction *OldPtr; - // Output members carrying state about the result of visiting and rewriting - // the slice of the alloca. - bool IsUsedByRewrittenSpeculatableInstructions; + // Track post-rewrite users which are PHI nodes and Selects. + SmallPtrSetImpl<PHINode *> &PHIUsers; + SmallPtrSetImpl<SelectInst *> &SelectUsers; // Utility IR builder, whose name prefix is set up for each visited use, and // the insertion point is set to point to the user. @@ -1949,22 +2040,25 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> { public: AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &S, SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI, - uint64_t NewBeginOffset, uint64_t NewEndOffset, - bool IsVectorPromotable = false, - bool IsIntegerPromotable = false) + uint64_t NewAllocaBeginOffset, + uint64_t NewAllocaEndOffset, bool IsVectorPromotable, + bool IsIntegerPromotable, + SmallPtrSetImpl<PHINode *> &PHIUsers, + SmallPtrSetImpl<SelectInst *> &SelectUsers) : DL(DL), S(S), Pass(Pass), OldAI(OldAI), NewAI(NewAI), - NewAllocaBeginOffset(NewBeginOffset), NewAllocaEndOffset(NewEndOffset), + NewAllocaBeginOffset(NewAllocaBeginOffset), + NewAllocaEndOffset(NewAllocaEndOffset), NewAllocaTy(NewAI.getAllocatedType()), - VecTy(IsVectorPromotable ? cast<VectorType>(NewAllocaTy) : 0), - ElementTy(VecTy ? VecTy->getElementType() : 0), + VecTy(IsVectorPromotable ? cast<VectorType>(NewAllocaTy) : nullptr), + ElementTy(VecTy ? VecTy->getElementType() : nullptr), ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0), IntTy(IsIntegerPromotable ? Type::getIntNTy( NewAI.getContext(), DL.getTypeSizeInBits(NewAI.getAllocatedType())) - : 0), + : nullptr), BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(), - OldPtr(), IsUsedByRewrittenSpeculatableInstructions(false), + OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers), IRB(NewAI.getContext(), ConstantFolder()) { if (VecTy) { assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 && @@ -1983,6 +2077,14 @@ public: IsSplit = BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset; + // Compute the intersecting offset range. + assert(BeginOffset < NewAllocaEndOffset); + assert(EndOffset > NewAllocaBeginOffset); + NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); + NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); + + SliceSize = NewEndOffset - NewBeginOffset; + OldUse = I->getUse(); OldPtr = cast<Instruction>(OldUse->get()); @@ -1997,20 +2099,6 @@ public: return CanSROA; } - /// \brief Query whether this slice is used by speculatable instructions after - /// rewriting. - /// - /// These instructions (PHIs and Selects currently) require the alloca slice - /// to run back through the rewriter. Thus, they are promotable, but not on - /// this iteration. This is distinct from a slice which is unpromotable for - /// some other reason, in which case we don't even want to perform the - /// speculation. This can be queried at any time and reflects whether (at - /// that point) a visit call has rewritten a speculatable instruction on the - /// current slice. - bool isUsedByRewrittenSpeculatableInstructions() const { - return IsUsedByRewrittenSpeculatableInstructions; - } - private: // Make sure the other visit overloads are visible.
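
// The intersection just computed in visit() is the heart of split-slice
// rewriting: every per-use offset below is taken relative to it. A
// standalone sketch of the same clamping, with plain integers standing in
// for the rewriter's state:
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>

struct SliceRange { uint64_t NewBegin, NewEnd, SliceSize; };

SliceRange intersect(uint64_t BeginOffset, uint64_t EndOffset,
                     uint64_t AllocaBegin, uint64_t AllocaEnd) {
  // The slice must overlap the new alloca, or it would not be rewritten here.
  assert(BeginOffset < AllocaEnd && EndOffset > AllocaBegin);
  uint64_t NewBegin = std::max(BeginOffset, AllocaBegin);
  uint64_t NewEnd = std::min(EndOffset, AllocaEnd);
  return {NewBegin, NewEnd, NewEnd - NewBegin};
}

int main() {
  // A split slice [0, 12) against a new alloca covering [8, 16).
  SliceRange R = intersect(0, 12, 8, 16);
  std::cout << R.NewBegin << ' ' << R.NewEnd << ' ' << R.SliceSize << '\n';
  // prints "8 12 4"
}
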
using Base::visit; @@ -2021,30 +2109,53 @@ private: llvm_unreachable("No rewrite rule for this instruction!"); } - Value *getAdjustedAllocaPtr(IRBuilderTy &IRB, uint64_t Offset, - Type *PointerTy) { - assert(Offset >= NewAllocaBeginOffset); - return getAdjustedPtr(IRB, DL, &NewAI, APInt(DL.getPointerSizeInBits(), - Offset - NewAllocaBeginOffset), - PointerTy); + Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) { + // Note that the offset computation can use BeginOffset or NewBeginOffset + // interchangeably for unsplit slices. + assert(IsSplit || BeginOffset == NewBeginOffset); + uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; + +#ifndef NDEBUG + StringRef OldName = OldPtr->getName(); + // Skip through the last '.sroa.' component of the name. + size_t LastSROAPrefix = OldName.rfind(".sroa."); + if (LastSROAPrefix != StringRef::npos) { + OldName = OldName.substr(LastSROAPrefix + strlen(".sroa.")); + // Look for an SROA slice index. + size_t IndexEnd = OldName.find_first_not_of("0123456789"); + if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') { + // Strip the index and look for the offset. + OldName = OldName.substr(IndexEnd + 1); + size_t OffsetEnd = OldName.find_first_not_of("0123456789"); + if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.') + // Strip the offset. + OldName = OldName.substr(OffsetEnd + 1); + } + } + // Strip any SROA suffixes as well. + OldName = OldName.substr(0, OldName.find(".sroa_")); +#endif + + return getAdjustedPtr(IRB, DL, &NewAI, + APInt(DL.getPointerSizeInBits(), Offset), PointerTy, +#ifndef NDEBUG + Twine(OldName) + "." +#else + Twine() +#endif + ); } - /// \brief Compute suitable alignment to access an offset into the new alloca. - unsigned getOffsetAlign(uint64_t Offset) { + /// \brief Compute suitable alignment to access this slice of the *new* alloca. + /// + /// You can optionally pass a type to this routine and if that type's ABI + /// alignment is itself suitable, this will return zero. + unsigned getSliceAlign(Type *Ty = nullptr) { unsigned NewAIAlign = NewAI.getAlignment(); if (!NewAIAlign) NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType()); - return MinAlign(NewAIAlign, Offset); - } - - /// \brief Compute suitable alignment to access a type at an offset of the - /// new alloca. - /// - /// \returns zero if the type's ABI alignment is a suitable alignment, - /// otherwise returns the maximal suitable alignment. - unsigned getOffsetTypeAlign(Type *Ty, uint64_t Offset) { - unsigned Align = getOffsetAlign(Offset); - return Align == DL.getABITypeAlignment(Ty) ? 0 : Align; + unsigned Align = MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset); + return (Ty && Align == DL.getABITypeAlignment(Ty)) ? 
0 : Align; } unsigned getIndex(uint64_t Offset) { @@ -2062,8 +2173,7 @@ private: Pass.DeadInsts.insert(I); } - Value *rewriteVectorizedLoadInst(uint64_t NewBeginOffset, - uint64_t NewEndOffset) { + Value *rewriteVectorizedLoadInst() { unsigned BeginIndex = getIndex(NewBeginOffset); unsigned EndIndex = getIndex(NewEndOffset); assert(EndIndex > BeginIndex && "Empty vector!"); @@ -2073,8 +2183,7 @@ private: return extractVector(IRB, V, BeginIndex, EndIndex, "vec"); } - Value *rewriteIntegerLoad(LoadInst &LI, uint64_t NewBeginOffset, - uint64_t NewEndOffset) { + Value *rewriteIntegerLoad(LoadInst &LI) { assert(IntTy && "We cannot insert an integer to the alloca"); assert(!LI.isVolatile()); Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), @@ -2093,32 +2202,23 @@ private: Value *OldOp = LI.getOperand(0); assert(OldOp == OldPtr); - // Compute the intersecting offset range. - assert(BeginOffset < NewAllocaEndOffset); - assert(EndOffset > NewAllocaBeginOffset); - uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); - uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); - - uint64_t Size = NewEndOffset - NewBeginOffset; - - Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), Size * 8) + Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8) : LI.getType(); bool IsPtrAdjusted = false; Value *V; if (VecTy) { - V = rewriteVectorizedLoadInst(NewBeginOffset, NewEndOffset); + V = rewriteVectorizedLoadInst(); } else if (IntTy && LI.getType()->isIntegerTy()) { - V = rewriteIntegerLoad(LI, NewBeginOffset, NewEndOffset); + V = rewriteIntegerLoad(LI); } else if (NewBeginOffset == NewAllocaBeginOffset && canConvertValue(DL, NewAllocaTy, LI.getType())) { V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - LI.isVolatile(), "load"); + LI.isVolatile(), LI.getName()); } else { Type *LTy = TargetTy->getPointerTo(); - V = IRB.CreateAlignedLoad( - getAdjustedAllocaPtr(IRB, NewBeginOffset, LTy), - getOffsetTypeAlign(TargetTy, NewBeginOffset - NewAllocaBeginOffset), - LI.isVolatile(), "load"); + V = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy), + getSliceAlign(TargetTy), LI.isVolatile(), + LI.getName()); IsPtrAdjusted = true; } V = convertValue(DL, IRB, V, TargetTy); @@ -2127,13 +2227,13 @@ private: assert(!LI.isVolatile()); assert(LI.getType()->isIntegerTy() && "Only integer type loads and stores are split"); - assert(Size < DL.getTypeStoreSize(LI.getType()) && + assert(SliceSize < DL.getTypeStoreSize(LI.getType()) && "Split load isn't smaller than original load"); assert(LI.getType()->getIntegerBitWidth() == DL.getTypeStoreSizeInBits(LI.getType()) && "Non-byte-multiple bit width"); // Move the insertion point just past the load so that we can refer to it. - IRB.SetInsertPoint(llvm::next(BasicBlock::iterator(&LI))); + IRB.SetInsertPoint(std::next(BasicBlock::iterator(&LI))); // Create a placeholder value with the same type as LI to use as the // basis for the new value. 
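
// rewriteVectorizedLoadInst above maps the slice's byte range onto vector
// lanes via getIndex(). A standalone model of that arithmetic, with
// std::vector standing in for an IR vector value (sizes are hypothetical):
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int> extractLanes(const std::vector<int> &V, uint64_t ElementSize,
                              uint64_t AllocaBegin, uint64_t SliceBegin,
                              uint64_t SliceEnd) {
  // getIndex() requires the slice to start and end on element boundaries.
  assert((SliceBegin - AllocaBegin) % ElementSize == 0);
  assert((SliceEnd - AllocaBegin) % ElementSize == 0);
  unsigned BeginIndex = (SliceBegin - AllocaBegin) / ElementSize;
  unsigned EndIndex = (SliceEnd - AllocaBegin) / ElementSize;
  assert(EndIndex > BeginIndex && "Empty vector!");
  return std::vector<int>(V.begin() + BeginIndex, V.begin() + EndIndex);
}

int main() {
  // A <4 x i32> alloca starting at offset 0; a load of bytes [4, 12)
  // touches lanes 1 and 2 only.
  for (int Lane : extractLanes({10, 11, 12, 13}, 4, 0, 4, 12))
    std::cout << Lane << ' '; // prints "11 12"
  std::cout << '\n';
}
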
This allows us to replace the uses of LI with // the computed value, and then replace the placeholder with LI, leaving @@ -2155,9 +2255,7 @@ private: return !LI.isVolatile() && !IsPtrAdjusted; } - bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp, - uint64_t NewBeginOffset, - uint64_t NewEndOffset) { + bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp) { if (V->getType() != VecTy) { unsigned BeginIndex = getIndex(NewBeginOffset); unsigned EndIndex = getIndex(NewEndOffset); @@ -2183,8 +2281,7 @@ private: return true; } - bool rewriteIntegerStore(Value *V, StoreInst &SI, - uint64_t NewBeginOffset, uint64_t NewEndOffset) { + bool rewriteIntegerStore(Value *V, StoreInst &SI) { assert(IntTy && "We cannot extract an integer from the alloca"); assert(!SI.isVolatile()); if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { @@ -2217,30 +2314,22 @@ private: if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets())) Pass.PostPromotionWorklist.insert(AI); - // Compute the intersecting offset range. - assert(BeginOffset < NewAllocaEndOffset); - assert(EndOffset > NewAllocaBeginOffset); - uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); - uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); - - uint64_t Size = NewEndOffset - NewBeginOffset; - if (Size < DL.getTypeStoreSize(V->getType())) { + if (SliceSize < DL.getTypeStoreSize(V->getType())) { assert(!SI.isVolatile()); assert(V->getType()->isIntegerTy() && "Only integer type loads and stores are split"); assert(V->getType()->getIntegerBitWidth() == DL.getTypeStoreSizeInBits(V->getType()) && "Non-byte-multiple bit width"); - IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), Size * 8); + IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8); V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset, "extract"); } if (VecTy) - return rewriteVectorizedStoreInst(V, SI, OldOp, NewBeginOffset, - NewEndOffset); + return rewriteVectorizedStoreInst(V, SI, OldOp); if (IntTy && V->getType()->isIntegerTy()) - return rewriteIntegerStore(V, SI, NewBeginOffset, NewEndOffset); + return rewriteIntegerStore(V, SI); StoreInst *NewSI; if (NewBeginOffset == NewAllocaBeginOffset && @@ -2250,12 +2339,9 @@ private: NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), SI.isVolatile()); } else { - Value *NewPtr = getAdjustedAllocaPtr(IRB, NewBeginOffset, - V->getType()->getPointerTo()); - NewSI = IRB.CreateAlignedStore( - V, NewPtr, getOffsetTypeAlign( - V->getType(), NewBeginOffset - NewAllocaBeginOffset), - SI.isVolatile()); + Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo()); + NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()), + SI.isVolatile()); } (void)NewSI; Pass.DeadInsts.insert(&SI); @@ -2307,11 +2393,10 @@ private: // pointer to the new alloca. 
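
// The split-store narrowing above (extractInteger) pulls SliceSize bytes out
// of a wider integer at a byte offset. A standalone sketch of that
// extraction, assuming a little-endian layout where byte offset N is bit
// offset 8*N from the least significant end:
#include <cstdint>
#include <iostream>

uint64_t extractIntegerBytes(uint64_t V, unsigned ByteOffset,
                             unsigned NumBytes) {
  uint64_t Shifted = V >> (8 * ByteOffset);
  // Mask down to NumBytes (guard the 8-byte case, where no mask is needed).
  return NumBytes >= 8 ? Shifted : Shifted & ((1ULL << (8 * NumBytes)) - 1);
}

int main() {
  // Store only bytes [2, 4) of the i64 value 0x8877665544332211.
  std::cout << std::hex << extractIntegerBytes(0x8877665544332211ULL, 2, 2)
            << '\n'; // prints "4433"
}
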
if (!isa<Constant>(II.getLength())) { assert(!IsSplit); - assert(BeginOffset >= NewAllocaBeginOffset); - II.setDest( - getAdjustedAllocaPtr(IRB, BeginOffset, II.getRawDest()->getType())); + assert(NewBeginOffset == BeginOffset); + II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType())); Type *CstTy = II.getAlignmentCst()->getType(); - II.setAlignment(ConstantInt::get(CstTy, getOffsetAlign(BeginOffset))); + II.setAlignment(ConstantInt::get(CstTy, getSliceAlign())); deleteIfTriviallyDead(OldPtr); return false; @@ -2323,13 +2408,6 @@ private: Type *AllocaTy = NewAI.getAllocatedType(); Type *ScalarTy = AllocaTy->getScalarType(); - // Compute the intersecting offset range. - assert(BeginOffset < NewAllocaEndOffset); - assert(EndOffset > NewAllocaBeginOffset); - uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); - uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); - uint64_t SliceOffset = NewBeginOffset - NewAllocaBeginOffset; - // If this doesn't map cleanly onto the alloca type, and that type isn't // a single value type, just emit a memset. if (!VecTy && !IntTy && @@ -2341,8 +2419,8 @@ private: Type *SizeTy = II.getLength()->getType(); Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); CallInst *New = IRB.CreateMemSet( - getAdjustedAllocaPtr(IRB, NewBeginOffset, II.getRawDest()->getType()), - II.getValue(), Size, getOffsetAlign(SliceOffset), II.isVolatile()); + getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size, + getSliceAlign(), II.isVolatile()); (void)New; DEBUG(dbgs() << " to: " << *New << "\n"); return false; @@ -2419,25 +2497,11 @@ private: DEBUG(dbgs() << " original: " << II << "\n"); - // Compute the intersecting offset range. - assert(BeginOffset < NewAllocaEndOffset); - assert(EndOffset > NewAllocaBeginOffset); - uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); - uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); - - assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr); - bool IsDest = II.getRawDest() == OldPtr; + bool IsDest = &II.getRawDestUse() == OldUse; + assert((IsDest && II.getRawDest() == OldPtr) || + (!IsDest && II.getRawSource() == OldPtr)); - // Compute the relative offset within the transfer. - unsigned IntPtrWidth = DL.getPointerSizeInBits(); - APInt RelOffset(IntPtrWidth, NewBeginOffset - BeginOffset); - - unsigned Align = II.getAlignment(); - uint64_t SliceOffset = NewBeginOffset - NewAllocaBeginOffset; - if (Align > 1) - Align = - MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(), - MinAlign(II.getAlignment(), getOffsetAlign(SliceOffset))); + unsigned SliceAlign = getSliceAlign(); // For unsplit intrinsics, we simply modify the source and destination // pointers in place. This isn't just an optimization, it is a matter of // correctness. With unsplit intrinsics we may be dealing with transfers // within a single alloca before SROA ran, or with transfers that have a // variable length. We may also be dealing with memmove instead of // memcpy, and so simply updating the pointers is necessary for us to // update both source and dest of a single call. if (!IsSplittable) { - Value *OldOp = IsDest ?
II.getRawDest() : II.getRawSource(); + Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); if (IsDest) - II.setDest( - getAdjustedAllocaPtr(IRB, BeginOffset, II.getRawDest()->getType())); + II.setDest(AdjustedPtr); else - II.setSource(getAdjustedAllocaPtr(IRB, BeginOffset, - II.getRawSource()->getType())); + II.setSource(AdjustedPtr); - Type *CstTy = II.getAlignmentCst()->getType(); - II.setAlignment(ConstantInt::get(CstTy, Align)); + if (II.getAlignment() > SliceAlign) { + Type *CstTy = II.getAlignmentCst()->getType(); + II.setAlignment( + ConstantInt::get(CstTy, MinAlign(II.getAlignment(), SliceAlign))); + } DEBUG(dbgs() << " to: " << II << "\n"); - deleteIfTriviallyDead(OldOp); + deleteIfTriviallyDead(OldPtr); return false; } // For split transfer intrinsics we have an incredibly useful assurance: @@ -2495,37 +2560,39 @@ private: // alloca that should be re-examined after rewriting this instruction. Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest(); if (AllocaInst *AI - = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) + = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) { + assert(AI != &OldAI && AI != &NewAI && + "Splittable transfers cannot reach the same alloca on both ends."); Pass.Worklist.insert(AI); + } - if (EmitMemCpy) { - Type *OtherPtrTy = IsDest ? II.getRawSource()->getType() - : II.getRawDest()->getType(); + Type *OtherPtrTy = OtherPtr->getType(); + unsigned OtherAS = OtherPtrTy->getPointerAddressSpace(); + // Compute the relative offset for the other pointer within the transfer. + unsigned IntPtrWidth = DL.getPointerSizeInBits(OtherAS); + APInt OtherOffset(IntPtrWidth, NewBeginOffset - BeginOffset); + unsigned OtherAlign = MinAlign(II.getAlignment() ? II.getAlignment() : 1, + OtherOffset.zextOrTrunc(64).getZExtValue()); + + if (EmitMemCpy) { // Compute the other pointer, folding as much as possible to produce // a single, simple GEP in most cases. - OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, RelOffset, OtherPtrTy); + OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy, + OtherPtr->getName() + "."); - Value *OurPtr = getAdjustedAllocaPtr( - IRB, NewBeginOffset, - IsDest ? II.getRawDest()->getType() : II.getRawSource()->getType()); + Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); Type *SizeTy = II.getLength()->getType(); Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); - CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr, - IsDest ? OtherPtr : OurPtr, - Size, Align, II.isVolatile()); + CallInst *New = IRB.CreateMemCpy( + IsDest ? OurPtr : OtherPtr, IsDest ? OtherPtr : OurPtr, Size, + MinAlign(SliceAlign, OtherAlign), II.isVolatile()); (void)New; DEBUG(dbgs() << " to: " << *New << "\n"); return false; } - // Note that we clamp the alignment to 1 here as a 0 alignment for a memcpy - // is equivalent to 1, but that isn't true if we end up rewriting this as - // a load or store. - if (!Align) - Align = 1; - bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset && NewEndOffset == NewAllocaEndOffset; uint64_t Size = NewEndOffset - NewBeginOffset; @@ -2533,24 +2600,32 @@ private: unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0; unsigned NumElements = EndIndex - BeginIndex; IntegerType *SubIntTy - = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : 0; + = IntTy ? 
Type::getIntNTy(IntTy->getContext(), Size*8) : nullptr; - Type *OtherPtrTy = NewAI.getType(); + // Reset the other pointer type to match the register type we're going to + // use, but using the address space of the original other pointer. if (VecTy && !IsWholeAlloca) { if (NumElements == 1) OtherPtrTy = VecTy->getElementType(); else OtherPtrTy = VectorType::get(VecTy->getElementType(), NumElements); - OtherPtrTy = OtherPtrTy->getPointerTo(); + OtherPtrTy = OtherPtrTy->getPointerTo(OtherAS); } else if (IntTy && !IsWholeAlloca) { - OtherPtrTy = SubIntTy->getPointerTo(); + OtherPtrTy = SubIntTy->getPointerTo(OtherAS); + } else { + OtherPtrTy = NewAllocaTy->getPointerTo(OtherAS); } - Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, RelOffset, OtherPtrTy); + Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy, + OtherPtr->getName() + "."); + unsigned SrcAlign = OtherAlign; Value *DstPtr = &NewAI; - if (!IsDest) + unsigned DstAlign = SliceAlign; + if (!IsDest) { std::swap(SrcPtr, DstPtr); + std::swap(SrcAlign, DstAlign); + } Value *Src; if (VecTy && !IsWholeAlloca && !IsDest) { @@ -2564,7 +2639,7 @@ private: uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract"); } else { - Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(), + Src = IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), "copyload"); } @@ -2582,7 +2657,7 @@ private: } StoreInst *Store = cast<StoreInst>( - IRB.CreateAlignedStore(Src, DstPtr, Align, II.isVolatile())); + IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile())); (void)Store; DEBUG(dbgs() << " to: " << *Store << "\n"); return !II.isVolatile(); @@ -2594,20 +2669,13 @@ private: DEBUG(dbgs() << " original: " << II << "\n"); assert(II.getArgOperand(1) == OldPtr); - // Compute the intersecting offset range. - assert(BeginOffset < NewAllocaEndOffset); - assert(EndOffset > NewAllocaBeginOffset); - uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); - uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); - // Record this instruction for deletion. Pass.DeadInsts.insert(&II); ConstantInt *Size = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), NewEndOffset - NewBeginOffset); - Value *Ptr = - getAdjustedAllocaPtr(IRB, NewBeginOffset, II.getArgOperand(1)->getType()); + Value *Ptr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); Value *New; if (II.getIntrinsicID() == Intrinsic::lifetime_start) New = IRB.CreateLifetimeStart(Ptr, Size); @@ -2628,28 +2696,22 @@ private: // as local as possible to the PHI. To do that, we re-use the location of // the old pointer, which necessarily must be in the right position to // dominate the PHI. - IRBuilderTy PtrBuilder(OldPtr); - PtrBuilder.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) + - "."); + IRBuilderTy PtrBuilder(IRB); + PtrBuilder.SetInsertPoint(OldPtr); + PtrBuilder.SetCurrentDebugLocation(OldPtr->getDebugLoc()); - Value *NewPtr = - getAdjustedAllocaPtr(PtrBuilder, BeginOffset, OldPtr->getType()); + Value *NewPtr = getNewAllocaSlicePtr(PtrBuilder, OldPtr->getType()); // Replace the operands which were using the old pointer. std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr); DEBUG(dbgs() << " to: " << PN << "\n"); deleteIfTriviallyDead(OldPtr); - // Check whether we can speculate this PHI node, and if so remember that - // fact and queue it up for another iteration after the speculation - // occurs. 
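
// The memtransfer rewrite above combines two alignment sources: the slice's
// own alignment and the other pointer's alignment after it has been offset.
// A standalone sketch of that arithmetic; minAlign mirrors llvm::MinAlign,
// the largest power of two dividing both inputs (lowest set bit of A | B):
#include <cstdint>
#include <iostream>

uint64_t minAlign(uint64_t A, uint64_t B) { return (A | B) & (~(A | B) + 1); }

uint64_t splitCopyAlign(uint64_t IntrinsicAlign, uint64_t OtherOffset,
                        uint64_t SliceAlign) {
  // An alignment of 0 on a memcpy means 1 once it is rewritten as load/store.
  uint64_t OtherAlign =
      minAlign(IntrinsicAlign ? IntrinsicAlign : 1, OtherOffset);
  return minAlign(SliceAlign, OtherAlign);
}

int main() {
  // An 8-byte-aligned memcpy split 4 bytes in, against an 8-aligned slice,
  // can only promise 4-byte alignment.
  std::cout << splitCopyAlign(8, 4, 8) << '\n'; // prints "4"
}
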
- if (isSafePHIToSpeculate(PN, &DL)) { - Pass.SpeculatablePHIs.insert(&PN); - IsUsedByRewrittenSpeculatableInstructions = true; - return true; - } - - return false; // PHIs can't be promoted on their own. + // PHIs can't be promoted on their own, but often can be speculated. We + // check the speculation outside of the rewriter so that we see the + // fully-rewritten alloca. + PHIUsers.insert(&PN); + return true; } bool visitSelectInst(SelectInst &SI) { @@ -2659,7 +2721,7 @@ private: assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable"); assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable"); - Value *NewPtr = getAdjustedAllocaPtr(IRB, BeginOffset, OldPtr->getType()); + Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); // Replace the operands which were using the old pointer. if (SI.getOperand(1) == OldPtr) SI.setOperand(1, NewPtr); @@ -2669,16 +2731,11 @@ private: DEBUG(dbgs() << " to: " << SI << "\n"); deleteIfTriviallyDead(OldPtr); - // Check whether we can speculate this select instruction, and if so - // remember that fact and queue it up for another iteration after the - // speculation occurs. - if (isSafeSelectToSpeculate(SI, &DL)) { - Pass.SpeculatableSelects.insert(&SI); - IsUsedByRewrittenSpeculatableInstructions = true; - return true; - } - - return false; // Selects can't be promoted on their own. + // Selects can't be promoted on their own, but often can be speculated. We + // check the speculation outside of the rewriter so that we see the + // fully-rewritten alloca. + SelectUsers.insert(&SI); + return true; } }; @@ -2726,10 +2783,9 @@ private: /// Enqueue all the users of the given instruction for further processing. /// This uses a set to de-duplicate users. void enqueueUsers(Instruction &I) { - for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; - ++UI) - if (Visited.insert(*UI)) - Queue.push_back(&UI.getUse()); + for (Use &U : I.uses()) + if (Visited.insert(U.getUser())) + Queue.push_back(&U); } // Conservative default is to not rewrite anything. @@ -2942,22 +2998,22 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, return stripAggregateTypeWrapping(DL, Ty); if (Offset > DL.getTypeAllocSize(Ty) || (DL.getTypeAllocSize(Ty) - Offset) < Size) - return 0; + return nullptr; if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) { // We can't partition pointers... if (SeqTy->isPointerTy()) - return 0; + return nullptr; Type *ElementTy = SeqTy->getElementType(); uint64_t ElementSize = DL.getTypeAllocSize(ElementTy); uint64_t NumSkippedElements = Offset / ElementSize; if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) { if (NumSkippedElements >= ArrTy->getNumElements()) - return 0; + return nullptr; } else if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy)) { if (NumSkippedElements >= VecTy->getNumElements()) - return 0; + return nullptr; } Offset -= NumSkippedElements * ElementSize; @@ -2965,7 +3021,7 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, if (Offset > 0 || Size < ElementSize) { // Bail if the partition ends in a different array element. if ((Offset + Size) > ElementSize) - return 0; + return nullptr; // Recurse through the element type trying to peel off offset bytes. 
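
// A standalone sketch of the array case getTypePartition is handling here:
// can the byte range [Offset, Offset + Size) be expressed as a whole number
// of elements? This simplification omits the recursion into a single
// element that the real code performs for partial ranges; all sizes are
// hypothetical.
#include <cstdint>
#include <iostream>

// Returns the element count of the sub-array type, or 0 if no natural
// partition exists (partial elements at either end).
uint64_t arrayPartition(uint64_t ElementSize, uint64_t NumElements,
                        uint64_t Offset, uint64_t Size) {
  uint64_t Skipped = Offset / ElementSize;
  if (Skipped >= NumElements)
    return 0;
  Offset -= Skipped * ElementSize;
  if (Offset != 0 || Size % ElementSize != 0) // misaligned or partial element
    return 0;
  uint64_t N = Size / ElementSize;
  return (Skipped + N <= NumElements) ? N : 0;
}

int main() {
  // Bytes [8, 24) of [8 x i32] form a [4 x i32] partition.
  std::cout << arrayPartition(4, 8, 8, 16) << '\n'; // prints "4"
  // Bytes [2, 10) straddle element boundaries: no partition.
  std::cout << arrayPartition(4, 8, 2, 8) << '\n';  // prints "0"
}
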
return getTypePartition(DL, ElementTy, Offset, Size); } @@ -2976,20 +3032,20 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, assert(Size > ElementSize); uint64_t NumElements = Size / ElementSize; if (NumElements * ElementSize != Size) - return 0; + return nullptr; return ArrayType::get(ElementTy, NumElements); } StructType *STy = dyn_cast<StructType>(Ty); if (!STy) - return 0; + return nullptr; const StructLayout *SL = DL.getStructLayout(STy); if (Offset >= SL->getSizeInBytes()) - return 0; + return nullptr; uint64_t EndOffset = Offset + Size; if (EndOffset > SL->getSizeInBytes()) - return 0; + return nullptr; unsigned Index = SL->getElementContainingOffset(Offset); Offset -= SL->getElementOffset(Index); @@ -2997,12 +3053,12 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, Type *ElementTy = STy->getElementType(Index); uint64_t ElementSize = DL.getTypeAllocSize(ElementTy); if (Offset >= ElementSize) - return 0; // The offset points into alignment padding. + return nullptr; // The offset points into alignment padding. // See if any partition must be contained by the element. if (Offset > 0 || Size < ElementSize) { if ((Offset + Size) > ElementSize) - return 0; + return nullptr; return getTypePartition(DL, ElementTy, Offset, Size); } assert(Offset == 0); @@ -3015,14 +3071,14 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, if (EndOffset < SL->getSizeInBytes()) { unsigned EndIndex = SL->getElementContainingOffset(EndOffset); if (Index == EndIndex) - return 0; // Within a single element and its padding. + return nullptr; // Within a single element and its padding. // Don't try to form "natural" types if the elements don't line up with the // expected size. // FIXME: We could potentially recurse down through the last element in the // sub-struct to find a natural end point. if (SL->getElementOffset(EndIndex) != EndOffset) - return 0; + return nullptr; assert(Index < EndIndex); EE = STy->element_begin() + EndIndex; @@ -3033,7 +3089,7 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, STy->isPacked()); const StructLayout *SubSL = DL.getStructLayout(SubTy); if (Size != SubSL->getSizeInBytes()) - return 0; // The sub-struct doesn't have quite the size needed. + return nullptr; // The sub-struct doesn't have quite the size needed. return SubTy; } @@ -3058,7 +3114,7 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S, // Try to compute a friendly type for this partition of the alloca. This // won't always succeed, in which case we fall back to a legal integer type // or an i8 array of an appropriate size. - Type *SliceTy = 0; + Type *SliceTy = nullptr; if (Type *CommonUseTy = findCommonType(B, E, EndOffset)) if (DL->getTypeAllocSize(CommonUseTy) >= SliceSize) SliceTy = CommonUseTy; @@ -3105,7 +3161,7 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S, // the alloca's alignment unconstrained. if (Alignment <= DL->getABITypeAlignment(SliceTy)) Alignment = 0; - NewAI = new AllocaInst(SliceTy, 0, Alignment, + NewAI = new AllocaInst(SliceTy, nullptr, Alignment, AI.getName() + ".sroa." + Twine(B - S.begin()), &AI); ++NumNewAllocas; } @@ -3114,17 +3170,17 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S, << "[" << BeginOffset << "," << EndOffset << ") to: " << *NewAI << "\n"); - // Track the high watermark on several worklists that are only relevant for + // Track the high watermark on the worklist as it is only relevant for // promoted allocas. 
We will reset it to this point if the alloca is not in // fact scheduled for promotion. unsigned PPWOldSize = PostPromotionWorklist.size(); - unsigned SPOldSize = SpeculatablePHIs.size(); - unsigned SSOldSize = SpeculatableSelects.size(); unsigned NumUses = 0; + SmallPtrSet<PHINode *, 8> PHIUsers; + SmallPtrSet<SelectInst *, 8> SelectUsers; AllocaSliceRewriter Rewriter(*DL, S, *this, AI, *NewAI, BeginOffset, EndOffset, IsVectorPromotable, - IsIntegerPromotable); + IsIntegerPromotable, PHIUsers, SelectUsers); bool Promotable = true; for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(), SUE = SplitUses.end(); @@ -3145,50 +3201,60 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S, MaxUsesPerAllocaPartition = std::max<unsigned>(NumUses, MaxUsesPerAllocaPartition); - if (Promotable && !Rewriter.isUsedByRewrittenSpeculatableInstructions()) { - DEBUG(dbgs() << " and queuing for promotion\n"); - PromotableAllocas.push_back(NewAI); - } else if (NewAI != &AI || - (Promotable && - Rewriter.isUsedByRewrittenSpeculatableInstructions())) { + // Now that we've processed all the slices in the new partition, check if any + // PHIs or Selects would block promotion. + for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(), + E = PHIUsers.end(); + I != E; ++I) + if (!isSafePHIToSpeculate(**I, DL)) { + Promotable = false; + PHIUsers.clear(); + SelectUsers.clear(); + break; + } + for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(), + E = SelectUsers.end(); + I != E; ++I) + if (!isSafeSelectToSpeculate(**I, DL)) { + Promotable = false; + PHIUsers.clear(); + SelectUsers.clear(); + break; + } + + if (Promotable) { + if (PHIUsers.empty() && SelectUsers.empty()) { + // Promote the alloca. + PromotableAllocas.push_back(NewAI); + } else { + // If we have either PHIs or Selects to speculate, add them to those + // worklists and re-queue the new alloca so that we promote it on the + // next iteration. + for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(), + E = PHIUsers.end(); + I != E; ++I) + SpeculatablePHIs.insert(*I); + for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(), + E = SelectUsers.end(); + I != E; ++I) + SpeculatableSelects.insert(*I); + Worklist.insert(NewAI); + } + } else { + // If we can't promote the alloca, iterate on it to check for new // refinements exposed by splitting the current alloca. Don't iterate on an // alloca which didn't actually change and didn't get promoted. - // - // Alternatively, if we could promote the alloca but have speculatable - // instructions then we will speculate them after finishing our processing - // of the original alloca. Mark the new one for re-visiting in the next - // iteration so the speculated operations can be rewritten. - // - // FIXME: We should actually track whether the rewriter changed anything. - Worklist.insert(NewAI); - } - - // Drop any post-promotion work items if promotion didn't happen. - if (!Promotable) { + if (NewAI != &AI) + Worklist.insert(NewAI); + + // Drop any post-promotion work items if promotion didn't happen.
while (PostPromotionWorklist.size() > PPWOldSize) PostPromotionWorklist.pop_back(); - while (SpeculatablePHIs.size() > SPOldSize) - SpeculatablePHIs.pop_back(); - while (SpeculatableSelects.size() > SSOldSize) - SpeculatableSelects.pop_back(); } return true; } -namespace { -struct IsSliceEndLessOrEqualTo { - uint64_t UpperBound; - - IsSliceEndLessOrEqualTo(uint64_t UpperBound) : UpperBound(UpperBound) {} - - bool operator()(const AllocaSlices::iterator &I) { - return I->endOffset() <= UpperBound; - } -}; -} - static void removeFinishedSplitUses(SmallVectorImpl<AllocaSlices::iterator> &SplitUses, uint64_t &MaxSplitUseEndOffset, uint64_t Offset) { @@ -3200,7 +3266,9 @@ removeFinishedSplitUses(SmallVectorImpl<AllocaSlices::iterator> &SplitUses, size_t SplitUsesOldSize = SplitUses.size(); SplitUses.erase(std::remove_if(SplitUses.begin(), SplitUses.end(), - IsSliceEndLessOrEqualTo(Offset)), + [Offset](const AllocaSlices::iterator &I) { + return I->endOffset() <= Offset; + }), SplitUses.end()); if (SplitUsesOldSize == SplitUses.size()) return; @@ -3227,7 +3295,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &S) { uint64_t BeginOffset = S.begin()->beginOffset(); - for (AllocaSlices::iterator SI = S.begin(), SJ = llvm::next(SI), SE = S.end(); + for (AllocaSlices::iterator SI = S.begin(), SJ = std::next(SI), SE = S.end(); SI != SE; SI = SJ) { uint64_t MaxEndOffset = SI->endOffset(); @@ -3326,6 +3394,21 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &S) { return Changed; } +/// \brief Clobber a use with undef, deleting the used value if it becomes dead. +void SROA::clobberUse(Use &U) { + Value *OldV = U; + // Replace the use with an undef value. + U = UndefValue::get(OldV->getType()); + + // Check for this making an instruction dead. We have to garbage collect + // all the dead instructions to ensure the uses of any alloca end up being + // minimal. + if (Instruction *OldI = dyn_cast<Instruction>(OldV)) + if (isInstructionTriviallyDead(OldI)) { + DeadInsts.insert(OldI); + } +} + /// \brief Analyze an alloca for SROA. /// /// This analyzes the alloca to ensure we can reason about it, builds @@ -3363,21 +3446,22 @@ bool SROA::runOnAlloca(AllocaInst &AI) { for (AllocaSlices::dead_user_iterator DI = S.dead_user_begin(), DE = S.dead_user_end(); DI != DE; ++DI) { - Changed = true; + // Free up everything used by this instruction. + for (Use &DeadOp : (*DI)->operands()) + clobberUse(DeadOp); + + // Now replace the uses of this instruction. (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType())); + + // And mark it for deletion. DeadInsts.insert(*DI); + Changed = true; } for (AllocaSlices::dead_op_iterator DO = S.dead_op_begin(), DE = S.dead_op_end(); DO != DE; ++DO) { - Value *OldV = **DO; - // Clobber the use with an undef value. - **DO = UndefValue::get(OldV->getType()); - if (Instruction *OldI = dyn_cast<Instruction>(OldV)) - if (isInstructionTriviallyDead(OldI)) { - Changed = true; - DeadInsts.insert(OldI); - } + clobberUse(**DO); + Changed = true; } // No slices to split. Leave the dead alloca for a later pass to clean up. @@ -3413,10 +3497,10 @@ void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) { I->replaceAllUsesWith(UndefValue::get(I->getType())); - for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) - if (Instruction *U = dyn_cast<Instruction>(*OI)) { + for (Use &Operand : I->operands()) + if (Instruction *U = dyn_cast<Instruction>(Operand)) { // Zero out the operand and see if it becomes trivially dead. 
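
// removeFinishedSplitUses above uses the C++11 erase/remove_if idiom with a
// capturing lambda in place of the deleted IsSliceEndLessOrEqualTo functor.
// The same idiom on a plain vector of end offsets:
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<uint64_t> SplitUseEnds = {4, 8, 16, 32};
  uint64_t Offset = 8; // start of the next partition
  SplitUseEnds.erase(std::remove_if(SplitUseEnds.begin(), SplitUseEnds.end(),
                                    [Offset](uint64_t End) {
                                      return End <= Offset; // finished use
                                    }),
                     SplitUseEnds.end());
  for (uint64_t E : SplitUseEnds)
    std::cout << E << ' '; // prints "16 32"
  std::cout << '\n';
}
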
- *OI = 0; + Operand = nullptr; if (isInstructionTriviallyDead(U)) DeadInsts.insert(U); } @@ -3432,10 +3516,9 @@ void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) { static void enqueueUsersInWorklist(Instruction &I, SmallVectorImpl<Instruction *> &Worklist, SmallPtrSet<Instruction *, 8> &Visited) { - for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; - ++UI) - if (Visited.insert(cast<Instruction>(*UI))) - Worklist.push_back(cast<Instruction>(*UI)); + for (User *U : I.users()) + if (Visited.insert(cast<Instruction>(U))) + Worklist.push_back(cast<Instruction>(U)); } /// \brief Promote the allocas, using the best available technique. @@ -3521,32 +3604,24 @@ bool SROA::promoteAllocas(Function &F) { return true; } -namespace { - /// \brief A predicate to test whether an alloca belongs to a set. - class IsAllocaInSet { - typedef SmallPtrSet<AllocaInst *, 4> SetType; - const SetType &Set; - - public: - typedef AllocaInst *argument_type; - - IsAllocaInSet(const SetType &Set) : Set(Set) {} - bool operator()(AllocaInst *AI) const { return Set.count(AI); } - }; -} - bool SROA::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; + DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); C = &F.getContext(); - DL = getAnalysisIfAvailable<DataLayout>(); - if (!DL) { + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + if (!DLP) { DEBUG(dbgs() << " Skipping SROA -- no target data!\n"); return false; } - DT = getAnalysisIfAvailable<DominatorTree>(); + DL = &DLP->getDataLayout(); + DominatorTreeWrapperPass *DTWP = + getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + DT = DTWP ? &DTWP->getDomTree() : nullptr; BasicBlock &EntryBB = F.getEntryBlock(); - for (BasicBlock::iterator I = EntryBB.begin(), E = llvm::prior(EntryBB.end()); + for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); I != E; ++I) if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) Worklist.insert(AI); @@ -3564,11 +3639,14 @@ bool SROA::runOnFunction(Function &F) { // Remove the deleted allocas from various lists so that we don't try to // continue processing them. 
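
// enqueueUsersInWorklist above relies on insert() reporting whether the
// element was newly added, so each user is visited exactly once even when
// reachable along several def-use paths. A standalone model of that
// worklist pattern, with std::set::insert().second playing the role of
// SmallPtrSet::insert() over a hypothetical def-use graph:
#include <iostream>
#include <map>
#include <queue>
#include <set>
#include <string>
#include <vector>

int main() {
  // Hypothetical def-use edges: a value -> the instructions that use it.
  std::map<std::string, std::vector<std::string>> Users = {
      {"alloca", {"gep", "bitcast"}},
      {"gep", {"load", "store"}},
      {"bitcast", {"load"}}}; // "load" is reachable via two paths
  std::set<std::string> Visited;
  std::queue<std::string> Queue;
  Queue.push("alloca");
  Visited.insert("alloca");
  while (!Queue.empty()) {
    std::string I = Queue.front();
    Queue.pop();
    std::cout << I << '\n'; // each instruction is processed exactly once
    for (const std::string &U : Users[I])
      if (Visited.insert(U).second) // false if already enqueued
        Queue.push(U);
  }
}
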
if (!DeletedAllocas.empty()) { - Worklist.remove_if(IsAllocaInSet(DeletedAllocas)); - PostPromotionWorklist.remove_if(IsAllocaInSet(DeletedAllocas)); + auto IsInSet = [&](AllocaInst *AI) { + return DeletedAllocas.count(AI); + }; + Worklist.remove_if(IsInSet); + PostPromotionWorklist.remove_if(IsInSet); PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(), PromotableAllocas.end(), - IsAllocaInSet(DeletedAllocas)), + IsInSet), PromotableAllocas.end()); DeletedAllocas.clear(); } @@ -3585,6 +3663,6 @@ bool SROA::runOnFunction(Function &F) { void SROA::getAnalysisUsage(AnalysisUsage &AU) const { if (RequiresDomTree) - AU.addRequired<DominatorTree>(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); } diff --git a/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp b/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp index 9bcd702..73c97ff 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp @@ -22,38 +22,198 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "sample-profile" - +#include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" -#include "llvm/DebugInfo/DIContext.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/PostDominators.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Metadata.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/InstIterator.h" +#include "llvm/Support/LineIterator.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Regex.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar.h" +#include <cctype> using namespace llvm; +#define DEBUG_TYPE "sample-profile" + // Command line option to specify the file to read samples from. This is // mainly used for debugging. static cl::opt<std::string> SampleProfileFile( "sample-profile-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile file loaded by -sample-profile"), cl::Hidden); +static cl::opt<unsigned> SampleProfileMaxPropagateIterations( + "sample-profile-max-propagate-iterations", cl::init(100), + cl::desc("Maximum number of iterations to go through when propagating " + "sample block/edge weights through the CFG.")); + +namespace { +/// \brief Represents the relative location of an instruction. +/// +/// Instruction locations are specified by the line offset from the +/// beginning of the function (marked by the line where the function +/// header is) and the discriminator value within that line. +/// +/// The discriminator value is useful to distinguish instructions +/// that are on the same line but belong to different basic blocks +/// (e.g., the two post-increment instructions in "if (p) x++; else y++;"). 
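
// The discriminator example above, made concrete. Both increments sit on the
// same source line, so line numbers alone cannot separate their basic
// blocks; DWARF assigns each block on the line its own discriminator (the
// exact numbering is toolchain-dependent, so the values are illustrative).
#include <iostream>

int main(int argc, char **argv) {
  int x = 0, y = 0;
  bool p = argc > 1;
  if (p) x++; else y++; // one source line, two distinct basic blocks
  std::cout << x << ' ' << y << '\n';
}
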
+struct InstructionLocation { + InstructionLocation(int L, unsigned D) : LineOffset(L), Discriminator(D) {} + int LineOffset; + unsigned Discriminator; +}; +} + +namespace llvm { +template <> struct DenseMapInfo<InstructionLocation> { + typedef DenseMapInfo<int> OffsetInfo; + typedef DenseMapInfo<unsigned> DiscriminatorInfo; + static inline InstructionLocation getEmptyKey() { + return InstructionLocation(OffsetInfo::getEmptyKey(), + DiscriminatorInfo::getEmptyKey()); + } + static inline InstructionLocation getTombstoneKey() { + return InstructionLocation(OffsetInfo::getTombstoneKey(), + DiscriminatorInfo::getTombstoneKey()); + } + static inline unsigned getHashValue(InstructionLocation Val) { + return DenseMapInfo<std::pair<int, unsigned>>::getHashValue( + std::pair<int, unsigned>(Val.LineOffset, Val.Discriminator)); + } + static inline bool isEqual(InstructionLocation LHS, InstructionLocation RHS) { + return LHS.LineOffset == RHS.LineOffset && + LHS.Discriminator == RHS.Discriminator; + } +}; +} namespace { +typedef DenseMap<InstructionLocation, unsigned> BodySampleMap; +typedef DenseMap<BasicBlock *, unsigned> BlockWeightMap; +typedef DenseMap<BasicBlock *, BasicBlock *> EquivalenceClassMap; +typedef std::pair<BasicBlock *, BasicBlock *> Edge; +typedef DenseMap<Edge, unsigned> EdgeWeightMap; +typedef DenseMap<BasicBlock *, SmallVector<BasicBlock *, 8>> BlockEdgeMap; + +/// \brief Representation of the runtime profile for a function. +/// +/// This data structure contains the runtime profile for a given +/// function. It contains the total number of samples collected +/// in the function and a map of samples collected in every statement. +class SampleFunctionProfile { +public: + SampleFunctionProfile() + : TotalSamples(0), TotalHeadSamples(0), HeaderLineno(0), DT(nullptr), + PDT(nullptr), LI(nullptr), Ctx(nullptr) {} + + unsigned getFunctionLoc(Function &F); + bool emitAnnotations(Function &F, DominatorTree *DomTree, + PostDominatorTree *PostDomTree, LoopInfo *Loops); + unsigned getInstWeight(Instruction &I); + unsigned getBlockWeight(BasicBlock *B); + void addTotalSamples(unsigned Num) { TotalSamples += Num; } + void addHeadSamples(unsigned Num) { TotalHeadSamples += Num; } + void addBodySamples(int LineOffset, unsigned Discriminator, unsigned Num) { + assert(LineOffset >= 0); + BodySamples[InstructionLocation(LineOffset, Discriminator)] += Num; + } + void print(raw_ostream &OS); + void printEdgeWeight(raw_ostream &OS, Edge E); + void printBlockWeight(raw_ostream &OS, BasicBlock *BB); + void printBlockEquivalence(raw_ostream &OS, BasicBlock *BB); + bool computeBlockWeights(Function &F); + void findEquivalenceClasses(Function &F); + void findEquivalencesFor(BasicBlock *BB1, + SmallVector<BasicBlock *, 8> Descendants, + DominatorTreeBase<BasicBlock> *DomTree); + void propagateWeights(Function &F); + unsigned visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); + void buildEdges(Function &F); + bool propagateThroughEdges(Function &F); + bool empty() { return BodySamples.empty(); } + +protected: + /// \brief Total number of samples collected inside this function. + /// + /// Samples are cumulative, they include all the samples collected + /// inside this function and all its inlined callees. + unsigned TotalSamples; + + /// \brief Total number of samples collected at the head of the function. + /// FIXME: Use head samples to estimate a cold/hot attribute for the function. + unsigned TotalHeadSamples; + + /// \brief Line number for the function header. 
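
// A standalone analogue of the DenseMapInfo specialization above:
// std::unordered_map with a custom hash keyed on (line offset,
// discriminator). The hash combination here is illustrative, not LLVM's.
#include <functional>
#include <iostream>
#include <unordered_map>

struct Location {
  int LineOffset;
  unsigned Discriminator;
  bool operator==(const Location &O) const {
    return LineOffset == O.LineOffset && Discriminator == O.Discriminator;
  }
};

struct LocationHash {
  size_t operator()(const Location &L) const {
    // Combine both fields, as the pair-based getHashValue above does.
    unsigned long long Key =
        (static_cast<unsigned long long>(static_cast<unsigned>(L.LineOffset))
         << 32) | L.Discriminator;
    return std::hash<unsigned long long>()(Key);
  }
};

int main() {
  std::unordered_map<Location, unsigned, LocationHash> BodySamples;
  BodySamples[{3, 0}] += 120; // "3: 120"
  BodySamples[{3, 1}] += 30;  // "3.1: 30" -- same line, different block
  std::cout << BodySamples[{3, 0}] << ' ' << BodySamples[{3, 1}] << '\n';
}
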
Used to compute relative + /// line numbers from the absolute line LOCs found in instruction locations. + /// The relative line numbers are needed to address the samples from the + /// profile file. + unsigned HeaderLineno; + + /// \brief Map line offsets to collected samples. + /// + /// Each entry in this map contains the number of samples + /// collected at the corresponding line offset. All line locations + /// are an offset from the start of the function. + BodySampleMap BodySamples; + + /// \brief Map basic blocks to their computed weights. + /// + /// The weight of a basic block is defined to be the maximum + /// of all the instruction weights in that block. + BlockWeightMap BlockWeights; + + /// \brief Map edges to their computed weights. + /// + /// Edge weights are computed by propagating basic block weights in + /// SampleProfile::propagateWeights. + EdgeWeightMap EdgeWeights; + + /// \brief Set of visited blocks during propagation. + SmallPtrSet<BasicBlock *, 128> VisitedBlocks; + + /// \brief Set of visited edges during propagation. + SmallSet<Edge, 128> VisitedEdges; + + /// \brief Equivalence classes for block weights. + /// + /// Two blocks BB1 and BB2 are in the same equivalence class if they + /// dominate and post-dominate each other, and they are in the same loop + /// nest. When this happens, the two blocks are guaranteed to execute + /// the same number of times. + EquivalenceClassMap EquivalenceClass; + + /// \brief Dominance, post-dominance and loop information. + DominatorTree *DT; + PostDominatorTree *PDT; + LoopInfo *LI; + + /// \brief Predecessors for each basic block in the CFG. + BlockEdgeMap Predecessors; + + /// \brief Successors for each basic block in the CFG. + BlockEdgeMap Successors; + + /// \brief LLVM context holding the debug data we need. + LLVMContext *Ctx; +}; + /// \brief Sample-based profile reader. /// /// Each profile contains sample counts for all the functions @@ -77,61 +237,33 @@ namespace { /// 2. The samples collected at each line in F. To provide some /// protection against source code shuffling, line numbers should /// be relative to the start of the function. -class SampleProfile { +class SampleModuleProfile { public: - SampleProfile(StringRef F) : Profiles(0), Filename(F) {} + SampleModuleProfile(const Module &M, StringRef F) + : Profiles(0), Filename(F), M(M) {} void dump(); - void loadText(); + bool loadText(); void loadNative() { llvm_unreachable("not implemented"); } - bool emitAnnotations(Function &F); void printFunctionProfile(raw_ostream &OS, StringRef FName); void dumpFunctionProfile(StringRef FName); + SampleFunctionProfile &getProfile(const Function &F) { + return Profiles[F.getName()]; + } -protected: - typedef DenseMap<uint32_t, uint32_t> BodySampleMap; - typedef DenseMap<BasicBlock *, uint32_t> BlockWeightMap; - - /// \brief Representation of the runtime profile for a function. - /// - /// This data structure contains the runtime profile for a given - /// function. It contains the total number of samples collected - /// in the function and a map of samples collected in every statement. - struct FunctionProfile { - /// \brief Total number of samples collected inside this function. - /// - /// Samples are cumulative, they include all the samples collected - /// inside this function and all its inlined callees. - unsigned TotalSamples; - - // \brief Total number of samples collected at the head of the function. - unsigned TotalHeadSamples; - - /// \brief Map line offsets to collected samples. 
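
// The block-weight rule documented above, as a standalone function: a
// block's weight is the maximum of its instructions' sample counts, so a
// single hot instruction marks the whole block hot.
#include <algorithm>
#include <iostream>
#include <vector>

unsigned blockWeight(const std::vector<unsigned> &InstWeights) {
  unsigned W = 0;
  for (unsigned IW : InstWeights)
    W = std::max(W, IW);
  return W;
}

int main() {
  // Three instructions sampled 0, 57 and 12 times.
  std::cout << blockWeight({0, 57, 12}) << '\n'; // prints "57"
}
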
- /// - /// Each entry in this map contains the number of samples - /// collected at the corresponding line offset. All line locations - /// are an offset from the start of the function. - BodySampleMap BodySamples; - - /// \brief Map basic blocks to their computed weights. - /// - /// The weight of a basic block is defined to be the maximum - /// of all the instruction weights in that block. - BlockWeightMap BlockWeights; - }; - - uint32_t getInstWeight(Instruction &I, unsigned FirstLineno, - BodySampleMap &BodySamples); - uint32_t computeBlockWeight(BasicBlock *B, unsigned FirstLineno, - BodySampleMap &BodySamples); + /// \brief Report a parse error message. + void reportParseError(int64_t LineNumber, Twine Msg) const { + DiagnosticInfoSampleProfile Diag(Filename.data(), LineNumber, Msg); + M.getContext().diagnose(Diag); + } +protected: /// \brief Map every function to its associated profile. /// /// The profile of every function executed at runtime is collected - /// in the structure FunctionProfile. This maps function objects + /// in the structure SampleFunctionProfile. This maps function objects /// to their corresponding profiles. - StringMap<FunctionProfile> Profiles; + StringMap<SampleFunctionProfile> Profiles; /// \brief Path name to the file holding the profile data. /// @@ -140,63 +272,10 @@ protected: /// version of the profile format to be used in constructing test /// cases and debugging. StringRef Filename; -}; -/// \brief Loader class for text-based profiles. -/// -/// This class defines a simple interface to read text files containing -/// profiles. It keeps track of line number information and location of -/// the file pointer. Users of this class are responsible for actually -/// parsing the lines returned by the readLine function. -/// -/// TODO - This does not really belong here. It is a generic text file -/// reader. It should be moved to the Support library and made more general. -class ExternalProfileTextLoader { -public: - ExternalProfileTextLoader(StringRef F) : Filename(F) { - error_code EC; - EC = MemoryBuffer::getFile(Filename, Buffer); - if (EC) - report_fatal_error("Could not open profile file " + Filename + ": " + - EC.message()); - FP = Buffer->getBufferStart(); - Lineno = 0; - } - - /// \brief Read a line from the mapped file. - StringRef readLine() { - size_t Length = 0; - const char *start = FP; - while (FP != Buffer->getBufferEnd() && *FP != '\n') { - Length++; - FP++; - } - if (FP != Buffer->getBufferEnd()) - FP++; - Lineno++; - return StringRef(start, Length); - } - - /// \brief Return true, if we've reached EOF. - bool atEOF() const { return FP == Buffer->getBufferEnd(); } - - /// \brief Report a parse error message and stop compilation. - void reportParseError(Twine Msg) const { - report_fatal_error(Filename + ":" + Twine(Lineno) + ": " + Msg + "\n"); - } - -private: - /// \brief Memory buffer holding the text file. - OwningPtr<MemoryBuffer> Buffer; - - /// \brief Current position into the memory buffer. - const char *FP; - - /// \brief Current line number. - int64_t Lineno; - - /// \brief Path name where to the profile file. - StringRef Filename; + /// \brief Module being compiled. Used mainly to access the current + /// LLVM context for diagnostics. + const Module &M; }; /// \brief Sample profile pass. 
@@ -210,148 +289,242 @@ public: static char ID; SampleProfileLoader(StringRef Name = SampleProfileFile) - : FunctionPass(ID), Profiler(0), Filename(Name) { + : FunctionPass(ID), Profiler(), Filename(Name), ProfileIsValid(false) { initializeSampleProfileLoaderPass(*PassRegistry::getPassRegistry()); } - virtual bool doInitialization(Module &M); + bool doInitialization(Module &M) override; void dump() { Profiler->dump(); } - virtual const char *getPassName() const { return "Sample profile pass"; } + const char *getPassName() const override { return "Sample profile pass"; } - virtual bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + AU.addRequired<LoopInfo>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<PostDominatorTree>(); } protected: /// \brief Profile reader object. - OwningPtr<SampleProfile> Profiler; + std::unique_ptr<SampleModuleProfile> Profiler; /// \brief Name of the profile file to load. StringRef Filename; + + /// \brief Flag indicating whether the profile input loaded successfully. + bool ProfileIsValid; }; } -/// \brief Print the function profile for \p FName on stream \p OS. +/// \brief Print this function profile on stream \p OS. /// /// \param OS Stream to emit the output to. -/// \param FName Name of the function to print. -void SampleProfile::printFunctionProfile(raw_ostream &OS, StringRef FName) { - FunctionProfile FProfile = Profiles[FName]; - OS << "Function: " << FName << ", " << FProfile.TotalSamples << ", " - << FProfile.TotalHeadSamples << ", " << FProfile.BodySamples.size() +void SampleFunctionProfile::print(raw_ostream &OS) { + OS << TotalSamples << ", " << TotalHeadSamples << ", " << BodySamples.size() << " sampled lines\n"; - for (BodySampleMap::const_iterator SI = FProfile.BodySamples.begin(), - SE = FProfile.BodySamples.end(); + for (BodySampleMap::const_iterator SI = BodySamples.begin(), + SE = BodySamples.end(); SI != SE; ++SI) - OS << "\tline offset: " << SI->first + OS << "\tline offset: " << SI->first.LineOffset + << ", discriminator: " << SI->first.Discriminator << ", number of samples: " << SI->second << "\n"; OS << "\n"; } +/// \brief Print the weight of edge \p E on stream \p OS. +/// +/// \param OS Stream to emit the output to. +/// \param E Edge to print. +void SampleFunctionProfile::printEdgeWeight(raw_ostream &OS, Edge E) { + OS << "weight[" << E.first->getName() << "->" << E.second->getName() + << "]: " << EdgeWeights[E] << "\n"; +} + +/// \brief Print the equivalence class of block \p BB on stream \p OS. +/// +/// \param OS Stream to emit the output to. +/// \param BB Block to print. +void SampleFunctionProfile::printBlockEquivalence(raw_ostream &OS, + BasicBlock *BB) { + BasicBlock *Equiv = EquivalenceClass[BB]; + OS << "equivalence[" << BB->getName() + << "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n"; +} + +/// \brief Print the weight of block \p BB on stream \p OS. +/// +/// \param OS Stream to emit the output to. +/// \param BB Block to print. +void SampleFunctionProfile::printBlockWeight(raw_ostream &OS, BasicBlock *BB) { + OS << "weight[" << BB->getName() << "]: " << BlockWeights[BB] << "\n"; +} + +/// \brief Print the function profile for \p FName on stream \p OS. +/// +/// \param OS Stream to emit the output to. +/// \param FName Name of the function to print. 
+void SampleModuleProfile::printFunctionProfile(raw_ostream &OS, + StringRef FName) { + OS << "Function: " << FName << ":\n"; + Profiles[FName].print(OS); +} + /// \brief Dump the function profile for \p FName. /// /// \param FName Name of the function to print. -void SampleProfile::dumpFunctionProfile(StringRef FName) { +void SampleModuleProfile::dumpFunctionProfile(StringRef FName) { printFunctionProfile(dbgs(), FName); } /// \brief Dump all the function profiles found. -void SampleProfile::dump() { - for (StringMap<FunctionProfile>::const_iterator I = Profiles.begin(), - E = Profiles.end(); +void SampleModuleProfile::dump() { + for (StringMap<SampleFunctionProfile>::const_iterator I = Profiles.begin(), + E = Profiles.end(); I != E; ++I) dumpFunctionProfile(I->getKey()); } /// \brief Load samples from a text file. /// -/// The file is divided in two segments: -/// -/// Symbol table (represented with the string "symbol table") -/// Number of symbols in the table -/// symbol 1 -/// symbol 2 -/// ... -/// symbol N +/// The file contains a list of samples for every function executed at +/// runtime. Each function profile has the following format: /// -/// Function body profiles -/// function1:total_samples:total_head_samples:number_of_locations -/// location_offset_1: number_of_samples -/// location_offset_2: number_of_samples +/// function1:total_samples:total_head_samples +/// offset1[.discriminator]: number_of_samples [fn1:num fn2:num ... ] +/// offset2[.discriminator]: number_of_samples [fn3:num fn4:num ... ] /// ... -/// location_offset_N: number_of_samples +/// offsetN[.discriminator]: number_of_samples [fn5:num fn6:num ... ] /// /// Function names must be mangled in order for the profile loader to -/// match them in the current translation unit. +/// match them in the current translation unit. The two numbers in the +/// function header specify how many total samples were accumulated in +/// the function (first number), and the total number of samples accumulated +/// at the prologue of the function (second number). This head sample +/// count provides an indicator of how frequently the function is invoked. +/// +/// Each sampled line may contain several items. Some are optional +/// (marked below): +/// +/// a- Source line offset. This number represents the line number +/// in the function where the sample was collected. The line number +/// is always relative to the line where the symbol of the function +/// is defined. So, if the function has its header at line 280, +/// the offset 13 is at line 293 in the file. +/// +/// b- [OPTIONAL] Discriminator. This is used if the sampled program +/// was compiled with DWARF discriminator support +/// (http://wiki.dwarfstd.org/index.php?title=Path_Discriminators) +/// +/// c- Number of samples. This is the number of samples collected by +/// the profiler at this source location. +/// +/// d- [OPTIONAL] Potential call targets and samples. If present, this +/// line contains a call instruction. This models both direct and +/// indirect calls. Each called target is listed together with the +/// number of samples. For example, +/// +/// 130: 7 foo:3 bar:2 baz:7 +/// +/// The above means that at relative line offset 130 there is a +/// call instruction that calls one of foo(), bar() and baz(), with +/// baz() being the most frequent call target. +/// +/// FIXME: This is currently unhandled, but it has a lot of +/// potential for aiding the inliner.
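To make the grammar above concrete, a small hypothetical profile for a function whose definition starts at source line 280 might look as follows (the mangled names and all counts are invented for illustration):

    _Z3foov:5000:120
    1: 120
    4: 800
    13: 2300
    13.1: 1000
    21: 700 _Z3barv:500 _Z3bazv:200

Offset 13 corresponds to source line 293; offset 13.1 is the same line under a different DWARF discriminator; and the line at offset 21 records a call site with two observed targets.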
+/// /// /// Since this is a flat profile, a function that shows up more than /// once gets all its samples aggregated across all its instances. -/// TODO - flat profiles are too imprecise to provide good optimization -/// opportunities. Convert them to context-sensitive profile. +/// +/// FIXME: flat profiles are too imprecise to provide good optimization +/// opportunities. Convert them to context-sensitive profile. /// /// This textual representation is useful to generate unit tests and /// for debugging purposes, but it should not be used to generate /// profiles for large programs, as the representation is extremely /// inefficient. -void SampleProfile::loadText() { - ExternalProfileTextLoader Loader(Filename); - - // Read the symbol table. - StringRef Line = Loader.readLine(); - if (Line != "symbol table") - Loader.reportParseError("Expected 'symbol table', found " + Line); - int NumSymbols; - Line = Loader.readLine(); - if (Line.getAsInteger(10, NumSymbols)) - Loader.reportParseError("Expected a number, found " + Line); - for (int I = 0; I < NumSymbols; I++) { - StringRef FName = Loader.readLine(); - FunctionProfile &FProfile = Profiles[FName]; - FProfile.BodySamples.clear(); - FProfile.TotalSamples = 0; - FProfile.TotalHeadSamples = 0; +/// +/// \returns true if the file was loaded successfully, false otherwise. +bool SampleModuleProfile::loadText() { + ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr = + MemoryBuffer::getFile(Filename); + if (std::error_code EC = BufferOrErr.getError()) { + std::string Msg(EC.message()); + M.getContext().diagnose(DiagnosticInfoSampleProfile(Filename.data(), Msg)); + return false; } + std::unique_ptr<MemoryBuffer> Buffer = std::move(BufferOrErr.get()); + line_iterator LineIt(*Buffer, '#'); // Read the profile of each function. Since each function may be // mentioned more than once, and we are collecting flat profiles, // accumulate samples as we parse them. - Regex HeadRE("^([^:]+):([0-9]+):([0-9]+):([0-9]+)$"); - Regex LineSample("^([0-9]+): ([0-9]+)$"); - while (!Loader.atEOF()) { - SmallVector<StringRef, 4> Matches; - Line = Loader.readLine(); - if (!HeadRE.match(Line, &Matches)) - Loader.reportParseError("Expected 'mangled_name:NUM:NUM:NUM', found " + - Line); - assert(Matches.size() == 5); + Regex HeadRE("^([^0-9].*):([0-9]+):([0-9]+)$"); + Regex LineSample("^([0-9]+)\\.?([0-9]+)?: ([0-9]+)(.*)$"); + while (!LineIt.is_at_eof()) { + // Read the header of each function. + // + // Note that for function identifiers we are actually expecting + // mangled names, but we may not always get them. This happens when + // the compiler decides not to emit the function (e.g., it was inlined + // and removed). In this case, the binary will not have the linkage + // name for the function, so the profiler will emit the function's + // unmangled name, which may contain characters like ':' and '>' in its + // name (member functions, templates, etc). + // + // The only requirement we place on the identifier, then, is that it + // should not begin with a number. 
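The loader distinguishes the two line shapes, function header versus body sample, with the regular expressions quoted above. A minimal standalone sketch of that dispatch, using std::regex as a stand-in for LLVM's Regex (the patterns are copied from the pass; the helper names are invented):

    #include <regex>
    #include <string>

    // Patterns mirror HeadRE and LineSample in the pass.
    const std::regex HeadRE("^([^0-9].*):([0-9]+):([0-9]+)$");
    const std::regex LineSampleRE("^([0-9]+)\\.?([0-9]+)?: ([0-9]+)(.*)$");

    bool isFunctionHeader(const std::string &Line) {
      return std::regex_match(Line, HeadRE); // e.g. "_Z3foov:5000:120"
    }
    bool isBodySample(const std::string &Line) {
      return std::regex_match(Line, LineSampleRE); // e.g. "13.1: 1000"
    }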
+ SmallVector<StringRef, 3> Matches; + if (!HeadRE.match(*LineIt, &Matches)) { + reportParseError(LineIt.line_number(), + "Expected 'mangled_name:NUM:NUM', found " + *LineIt); + return false; + } + assert(Matches.size() == 4); StringRef FName = Matches[1]; - unsigned NumSamples, NumHeadSamples, NumSampledLines; + unsigned NumSamples, NumHeadSamples; Matches[2].getAsInteger(10, NumSamples); Matches[3].getAsInteger(10, NumHeadSamples); - Matches[4].getAsInteger(10, NumSampledLines); - FunctionProfile &FProfile = Profiles[FName]; - FProfile.TotalSamples += NumSamples; - FProfile.TotalHeadSamples += NumHeadSamples; - BodySampleMap &SampleMap = FProfile.BodySamples; - unsigned I; - for (I = 0; I < NumSampledLines && !Loader.atEOF(); I++) { - Line = Loader.readLine(); - if (!LineSample.match(Line, &Matches)) - Loader.reportParseError("Expected 'NUM: NUM', found " + Line); - assert(Matches.size() == 3); - unsigned LineOffset, NumSamples; + Profiles[FName] = SampleFunctionProfile(); + SampleFunctionProfile &FProfile = Profiles[FName]; + FProfile.addTotalSamples(NumSamples); + FProfile.addHeadSamples(NumHeadSamples); + ++LineIt; + + // Now read the body. The body of the function ends when we reach + // EOF or when we see the start of the next function. + while (!LineIt.is_at_eof() && isdigit((*LineIt)[0])) { + if (!LineSample.match(*LineIt, &Matches)) { + reportParseError( + LineIt.line_number(), + "Expected 'NUM[.NUM]: NUM[ mangled_name:NUM]*', found " + *LineIt); + return false; + } + assert(Matches.size() == 5); + unsigned LineOffset, NumSamples, Discriminator = 0; Matches[1].getAsInteger(10, LineOffset); - Matches[2].getAsInteger(10, NumSamples); - SampleMap[LineOffset] += NumSamples; - } + if (Matches[2] != "") + Matches[2].getAsInteger(10, Discriminator); + Matches[3].getAsInteger(10, NumSamples); - if (I < NumSampledLines) - Loader.reportParseError("Unexpected end of file"); + // FIXME: Handle called targets (in Matches[4]). + + // When dealing with instruction weights, we use the value + // zero to indicate the absence of a sample. If we read an + // actual zero from the profile file, return it as 1 to + // avoid the confusion later on. + if (NumSamples == 0) + NumSamples = 1; + FProfile.addBodySamples(LineOffset, Discriminator, NumSamples); + ++LineIt; + } } + + return true; } /// \brief Get the weight for an instruction. @@ -359,46 +532,49 @@ void SampleProfile::loadText() { /// The "weight" of an instruction \p Inst is the number of samples /// collected on that instruction at runtime. To retrieve it, we /// need to compute the line number of \p Inst relative to the start of its -/// function. We use \p FirstLineno to compute the offset. We then -/// look up the samples collected for \p Inst using \p BodySamples. +/// function. We use HeaderLineno to compute the offset. We then +/// look up the samples collected for \p Inst using BodySamples. /// /// \param Inst Instruction to query. -/// \param FirstLineno Line number of the first instruction in the function. -/// \param BodySamples Map of relative source line locations to samples. /// /// \returns The profiled weight of I. 
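A standalone sketch (not the pass's actual code) of the two lookups described here: samples are keyed by the pair (line offset from the function header, DWARF discriminator), and a block's weight is the maximum weight of its instructions, cached after the first query. The names mirror the pass's, but the types are simplified stand-ins.

    #include <algorithm>
    #include <map>
    #include <utility>
    #include <vector>

    using SampleKey = std::pair<unsigned, unsigned>; // (offset, discriminator)
    using BodySampleMap = std::map<SampleKey, unsigned>;

    // Instruction weight: 0 when the location carries no sample.
    unsigned instWeight(const BodySampleMap &Samples, unsigned HeaderLine,
                        unsigned Line, unsigned Discriminator) {
      if (Line < HeaderLine)
        return 0;
      auto It = Samples.find({Line - HeaderLine, Discriminator});
      return It == Samples.end() ? 0 : It->second;
    }

    // Block weight: max over the block's instruction locations, cached.
    struct Block { std::vector<SampleKey> InstLocs; };

    unsigned blockWeight(const BodySampleMap &Samples, const Block &B,
                         std::map<const Block *, unsigned> &Cache) {
      auto Entry = Cache.insert({&B, 0});
      if (!Entry.second)
        return Entry.first->second; // Computed on an earlier query.
      unsigned W = 0;
      for (const SampleKey &Loc : B.InstLocs) {
        auto It = Samples.find(Loc);
        if (It != Samples.end())
          W = std::max(W, It->second);
      }
      return Entry.first->second = W;
    }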
-uint32_t SampleProfile::getInstWeight(Instruction &Inst, unsigned FirstLineno, - BodySampleMap &BodySamples) { - unsigned LOffset = Inst.getDebugLoc().getLine() - FirstLineno + 1; - return BodySamples.lookup(LOffset); +unsigned SampleFunctionProfile::getInstWeight(Instruction &Inst) { + DebugLoc DLoc = Inst.getDebugLoc(); + unsigned Lineno = DLoc.getLine(); + if (Lineno < HeaderLineno) + return 0; + + DILocation DIL(DLoc.getAsMDNode(*Ctx)); + int LOffset = Lineno - HeaderLineno; + unsigned Discriminator = DIL.getDiscriminator(); + unsigned Weight = + BodySamples.lookup(InstructionLocation(LOffset, Discriminator)); + DEBUG(dbgs() << " " << Lineno << "." << Discriminator << ":" << Inst + << " (line offset: " << LOffset << "." << Discriminator + << " - weight: " << Weight << ")\n"); + return Weight; } /// \brief Compute the weight of a basic block. /// /// The weight of basic block \p B is the maximum weight of all the -/// instructions in B. +/// instructions in B. The weight of \p B is computed and cached in +/// the BlockWeights map. /// /// \param B The basic block to query. -/// \param FirstLineno The line number for the first line in the -/// function holding B. -/// \param BodySamples The map containing all the samples collected in that -/// function. /// /// \returns The computed weight of B. -uint32_t SampleProfile::computeBlockWeight(BasicBlock *B, unsigned FirstLineno, - BodySampleMap &BodySamples) { +unsigned SampleFunctionProfile::getBlockWeight(BasicBlock *B) { // If we've computed B's weight before, return it. - Function *F = B->getParent(); - FunctionProfile &FProfile = Profiles[F->getName()]; std::pair<BlockWeightMap::iterator, bool> Entry = - FProfile.BlockWeights.insert(std::make_pair(B, 0)); + BlockWeights.insert(std::make_pair(B, 0)); if (!Entry.second) return Entry.first->second; // Otherwise, compute and cache B's weight. - uint32_t Weight = 0; + unsigned Weight = 0; for (BasicBlock::iterator I = B->begin(), E = B->end(); I != E; ++I) { - uint32_t InstWeight = getInstWeight(*I, FirstLineno, BodySamples); + unsigned InstWeight = getInstWeight(*I); if (InstWeight > Weight) Weight = InstWeight; } @@ -406,31 +582,344 @@ uint32_t SampleProfile::computeBlockWeight(BasicBlock *B, unsigned FirstLineno, return Weight; } -/// \brief Generate branch weight metadata for all branches in \p F. +/// \brief Compute and store the weights of every basic block. +/// +/// This populates the BlockWeights map by computing +/// the weights of every basic block in the CFG. +/// +/// \param F The function to query. +bool SampleFunctionProfile::computeBlockWeights(Function &F) { + bool Changed = false; + DEBUG(dbgs() << "Block weights\n"); + for (Function::iterator B = F.begin(), E = F.end(); B != E; ++B) { + unsigned Weight = getBlockWeight(B); + Changed |= (Weight > 0); + DEBUG(printBlockWeight(dbgs(), B)); + } + + return Changed; +} + +/// \brief Find equivalence classes for the given block. /// -/// For every branch instruction B in \p F, we compute the weight of the -/// target block for each of the edges out of B. This is the weight -/// that we associate with that branch. +/// This finds all the blocks that are guaranteed to execute the same +/// number of times as \p BB1. To do this, it traverses all the +/// descendants of \p BB1 in the dominator or post-dominator tree. /// -/// TODO - This weight assignment will most likely be wrong if the -/// target branch has more than two predecessors. This needs to be done -/// using some form of flow propagation.
+/// A block BB2 will be in the same equivalence class as \p BB1 if +/// the following holds: /// -/// Once all the branch weights are computed, we emit the MD_prof -/// metadata on B using the computed values. +/// 1- \p BB1 is a descendant of BB2 in the opposite tree. So, if BB2 +/// is a descendant of \p BB1 in the dominator tree, then BB2 should +/// dominate BB1 in the post-dominator tree. +/// +/// 2- Both BB2 and \p BB1 must be in the same loop. +/// +/// For every block BB2 that meets those two requirements, we set BB2's +/// equivalence class to \p BB1. +/// +/// \param BB1 Block to check. +/// \param Descendants Descendants of \p BB1 in either the dom or pdom tree. +/// \param DomTree Opposite dominator tree. If \p Descendants is filled +/// with blocks from \p BB1's dominator tree, then +/// this is the post-dominator tree, and vice versa. +void SampleFunctionProfile::findEquivalencesFor( + BasicBlock *BB1, SmallVector<BasicBlock *, 8> Descendants, + DominatorTreeBase<BasicBlock> *DomTree) { + for (SmallVectorImpl<BasicBlock *>::iterator I = Descendants.begin(), + E = Descendants.end(); + I != E; ++I) { + BasicBlock *BB2 = *I; + bool IsDomParent = DomTree->dominates(BB2, BB1); + bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2); + if (BB1 != BB2 && VisitedBlocks.insert(BB2) && IsDomParent && + IsInSameLoop) { + EquivalenceClass[BB2] = BB1; + + // If BB2 is heavier than BB1, make BB2 have the same weight + // as BB1. + // + // Note that we don't worry about the opposite situation here + // (when BB2 is lighter than BB1). We will deal with this + // during the propagation phase. Right now, we just want to + // make sure that BB1 has the largest weight of all the + // members of its equivalence set. + unsigned &BB1Weight = BlockWeights[BB1]; + unsigned &BB2Weight = BlockWeights[BB2]; + BB1Weight = std::max(BB1Weight, BB2Weight); + } + } +} + +/// \brief Find equivalence classes. +/// +/// Since samples may be missing from blocks, we can fill in the gaps by setting +/// the weights of all the blocks in the same equivalence class to the same +/// weight. To compute the concept of equivalence, we use dominance and loop +/// information. Two blocks B1 and B2 are in the same equivalence class if B1 +/// dominates B2, B2 post-dominates B1 and both are in the same loop. /// /// \param F The function to query. -bool SampleProfile::emitAnnotations(Function &F) { +void SampleFunctionProfile::findEquivalenceClasses(Function &F) { + SmallVector<BasicBlock *, 8> DominatedBBs; + DEBUG(dbgs() << "\nBlock equivalence classes\n"); + // Find equivalence sets based on dominance and post-dominance information. + for (Function::iterator B = F.begin(), E = F.end(); B != E; ++B) { + BasicBlock *BB1 = B; + + // Compute BB1's equivalence class once. + if (EquivalenceClass.count(BB1)) { + DEBUG(printBlockEquivalence(dbgs(), BB1)); + continue; + } + + // By default, blocks are in their own equivalence class. + EquivalenceClass[BB1] = BB1; + + // Traverse all the blocks dominated by BB1. We are looking for + // every basic block BB2 such that: + // + // 1- BB1 dominates BB2. + // 2- BB2 post-dominates BB1. + // 3- BB1 and BB2 are in the same loop nest. + // + // If all those conditions hold, it means that BB2 is executed + // as many times as BB1, so they are placed in the same equivalence + // class by making BB2's equivalence class be BB1. 
+ DominatedBBs.clear(); + DT->getDescendants(BB1, DominatedBBs); + findEquivalencesFor(BB1, DominatedBBs, PDT->DT); + + // Repeat the same logic for all the blocks post-dominated by BB1. + // We are looking for every basic block BB2 such that: + // + // 1- BB1 post-dominates BB2. + // 2- BB2 dominates BB1. + // 3- BB1 and BB2 are in the same loop nest. + // + // If all those conditions hold, BB2's equivalence class is BB1. + DominatedBBs.clear(); + PDT->getDescendants(BB1, DominatedBBs); + findEquivalencesFor(BB1, DominatedBBs, DT); + + DEBUG(printBlockEquivalence(dbgs(), BB1)); + } + + // Assign weights to equivalence classes. + // + // All the basic blocks in the same equivalence class will execute + // the same number of times. Since we know that the head block in + // each equivalence class has the largest weight, assign that weight + // to all the blocks in that equivalence class. + DEBUG(dbgs() << "\nAssign the same weight to all blocks in the same class\n"); + for (Function::iterator B = F.begin(), E = F.end(); B != E; ++B) { + BasicBlock *BB = B; + BasicBlock *EquivBB = EquivalenceClass[BB]; + if (BB != EquivBB) + BlockWeights[BB] = BlockWeights[EquivBB]; + DEBUG(printBlockWeight(dbgs(), BB)); + } +} + +/// \brief Visit the given edge to decide if it has a valid weight. +/// +/// If \p E has not been visited before, we copy to \p UnknownEdge +/// and increment the count of unknown edges. +/// +/// \param E Edge to visit. +/// \param NumUnknownEdges Current number of unknown edges. +/// \param UnknownEdge Set if E has not been visited before. +/// +/// \returns E's weight, if known. Otherwise, return 0. +unsigned SampleFunctionProfile::visitEdge(Edge E, unsigned *NumUnknownEdges, + Edge *UnknownEdge) { + if (!VisitedEdges.count(E)) { + (*NumUnknownEdges)++; + *UnknownEdge = E; + return 0; + } + + return EdgeWeights[E]; +} + +/// \brief Propagate weights through incoming/outgoing edges. +/// +/// If the weight of a basic block is known, and there is only one edge +/// with an unknown weight, we can calculate the weight of that edge. +/// +/// Similarly, if all the edges have a known count, we can calculate the +/// count of the basic block, if needed. +/// +/// \param F Function to process. +/// +/// \returns True if new weights were assigned to edges or blocks. +bool SampleFunctionProfile::propagateThroughEdges(Function &F) { bool Changed = false; - FunctionProfile &FProfile = Profiles[F.getName()]; - unsigned FirstLineno = inst_begin(F)->getDebugLoc().getLine(); - MDBuilder MDB(F.getContext()); + DEBUG(dbgs() << "\nPropagation through edges\n"); + for (Function::iterator BI = F.begin(), EI = F.end(); BI != EI; ++BI) { + BasicBlock *BB = BI; + + // Visit all the predecessor and successor edges to determine + // which ones have a weight assigned already. Note that it doesn't + // matter that we only keep track of a single unknown edge. The + // only case we are interested in handling is when only a single + // edge is unknown (see setEdgeOrBlockWeight). + for (unsigned i = 0; i < 2; i++) { + unsigned TotalWeight = 0; + unsigned NumUnknownEdges = 0; + Edge UnknownEdge, SelfReferentialEdge; + + if (i == 0) { + // First, visit all predecessor edges. + for (size_t I = 0; I < Predecessors[BB].size(); I++) { + Edge E = std::make_pair(Predecessors[BB][I], BB); + TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge); + if (E.first == E.second) + SelfReferentialEdge = E; + } + } else { + // On the second round, visit all successor edges. 
+ for (size_t I = 0; I < Successors[BB].size(); I++) { + Edge E = std::make_pair(BB, Successors[BB][I]); + TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge); + } + } + + // After visiting all the edges, there are three cases that we + // can handle immediately: + // + // - All the edge weights are known (i.e., NumUnknownEdges == 0). + // In this case, we simply check that the sum of all the edges + // is the same as BB's weight. If not, we change BB's weight + // to match. Additionally, if BB had not been visited before, + // we mark it visited. + // + // - Only one edge is unknown and BB has already been visited. + // In this case, we can compute the weight of the edge by + // subtracting the sum of the known edge weights from the + // block weight. If the known edges weigh more than BB, the + // weight of the last remaining edge is set to zero. + // + // - There exists a self-referential edge and the weight of BB is + // known. In this case, this edge can be based on BB's weight. + // We add up all the other known edges and set the weight on + // the self-referential edge as we did in the previous case. + // + // In any other case, we must continue iterating. Eventually, + // all edges will get a weight, or iteration will stop when + // it reaches SampleProfileMaxPropagateIterations. + if (NumUnknownEdges <= 1) { + unsigned &BBWeight = BlockWeights[BB]; + if (NumUnknownEdges == 0) { + // If we already know the weight of all edges, the weight of the + // basic block can be computed. It should be no larger than the sum + // of all edge weights. + if (TotalWeight > BBWeight) { + BBWeight = TotalWeight; + Changed = true; + DEBUG(dbgs() << "All edge weights for " << BB->getName() + << " known. Set weight for block: "; + printBlockWeight(dbgs(), BB);); + } + if (VisitedBlocks.insert(BB)) + Changed = true; + } else if (NumUnknownEdges == 1 && VisitedBlocks.count(BB)) { + // If there is a single unknown edge and the block has been + // visited, then we can compute E's weight. + if (BBWeight >= TotalWeight) + EdgeWeights[UnknownEdge] = BBWeight - TotalWeight; + else + EdgeWeights[UnknownEdge] = 0; + VisitedEdges.insert(UnknownEdge); + Changed = true; + DEBUG(dbgs() << "Set weight for edge: "; + printEdgeWeight(dbgs(), UnknownEdge)); + } + } else if (SelfReferentialEdge.first && VisitedBlocks.count(BB)) { + unsigned &BBWeight = BlockWeights[BB]; + // We have a self-referential edge and the weight of BB is known. + if (BBWeight >= TotalWeight) + EdgeWeights[SelfReferentialEdge] = BBWeight - TotalWeight; + else + EdgeWeights[SelfReferentialEdge] = 0; + VisitedEdges.insert(SelfReferentialEdge); + Changed = true; + DEBUG(dbgs() << "Set self-referential edge weight to: "; + printEdgeWeight(dbgs(), SelfReferentialEdge)); + } + } + } + + return Changed; +} + +/// \brief Build in/out edge lists for each basic block in the CFG. +/// +/// We are interested in unique edges. If a block B1 has multiple +/// edges to another block B2, we only add a single B1->B2 edge. +void SampleFunctionProfile::buildEdges(Function &F) { + for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { + BasicBlock *B1 = I; + + // Add predecessors for B1. + SmallPtrSet<BasicBlock *, 16> Visited; + if (!Predecessors[B1].empty()) + llvm_unreachable("Found a stale predecessors list in a basic block."); + for (pred_iterator PI = pred_begin(B1), PE = pred_end(B1); PI != PE; ++PI) { + BasicBlock *B2 = *PI; + if (Visited.insert(B2)) + Predecessors[B1].push_back(B2); + } + + // Add successors for B1.
+ Visited.clear(); + if (!Successors[B1].empty()) + llvm_unreachable("Found a stale successors list in a basic block."); + for (succ_iterator SI = succ_begin(B1), SE = succ_end(B1); SI != SE; ++SI) { + BasicBlock *B2 = *SI; + if (Visited.insert(B2)) + Successors[B1].push_back(B2); + } + } +} - // Clear the block weights cache. - FProfile.BlockWeights.clear(); +/// \brief Propagate weights into edges +/// +/// The following rules are applied to every block B in the CFG: +/// +/// - If B has a single predecessor/successor, then the weight +/// of that edge is the weight of the block. +/// +/// - If all incoming or outgoing edges are known except one, and the +/// weight of the block is already known, the weight of the unknown +/// edge will be the weight of the block minus the sum of all the known +/// edges. If the sum of all the known edges is larger than B's weight, +/// we set the unknown edge weight to zero. +/// +/// - If there is a self-referential edge, and the weight of the block is +/// known, the weight for that edge is set to the weight of the block +/// minus the weight of the other incoming edges to that block (if +/// known). +void SampleFunctionProfile::propagateWeights(Function &F) { + bool Changed = true; + unsigned i = 0; + + // Before propagation starts, build, for each block, a list of + // unique predecessors and successors. This is necessary to handle + // identical edges in multiway branches. Since we visit all blocks and all + // edges of the CFG, it is cleaner to build these lists once at the start + // of the pass. + buildEdges(F); + + // Propagate until we converge or we go past the iteration limit. + while (Changed && i++ < SampleProfileMaxPropagateIterations) { + Changed = propagateThroughEdges(F); + } - // When we find a branch instruction: For each edge E out of the branch, - // the weight of E is the weight of the target block. + // Generate MD_prof metadata for every branch instruction using the + // edge weights computed during propagation. + DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n"); + MDBuilder MDB(F.getContext()); for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { BasicBlock *B = I; TerminatorInst *TI = B->getTerminator(); @@ -439,34 +928,155 @@ bool SampleProfile::emitAnnotations(Function &F) { if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI)) continue; - SmallVector<uint32_t, 4> Weights; - unsigned NSuccs = TI->getNumSuccessors(); - for (unsigned I = 0; I < NSuccs; ++I) { + DEBUG(dbgs() << "\nGetting weights for branch at line " + << TI->getDebugLoc().getLine() << ".\n"); + SmallVector<unsigned, 4> Weights; + bool AllWeightsZero = true; + for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) { BasicBlock *Succ = TI->getSuccessor(I); - uint32_t Weight = - computeBlockWeight(Succ, FirstLineno, FProfile.BodySamples); + Edge E = std::make_pair(B, Succ); + unsigned Weight = EdgeWeights[E]; + DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E)); Weights.push_back(Weight); + if (Weight != 0) + AllWeightsZero = false; } - TI->setMetadata(llvm::LLVMContext::MD_prof, - MDB.createBranchWeights(Weights)); - Changed = true; + // Only set weights if there is at least one non-zero weight. + // In any other case, let the analyzer set weights. + if (!AllWeightsZero) { + DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n"); + TI->setMetadata(llvm::LLVMContext::MD_prof, + MDB.createBranchWeights(Weights)); + } else { + DEBUG(dbgs() << "SKIPPED. 
All branch weights are zero.\n"); + } + } +} - return Changed; +/// \brief Get the line number for the function header. +/// +/// This looks up function \p F in the current compilation unit and +/// retrieves the line number where the function is defined. This is +/// line 0 for all the samples read from the profile file. Every line +/// number is relative to this line. +/// +/// \param F Function object to query. +/// +/// \returns the line number where \p F is defined. If it returns 0, +/// it means that there is no debug information available for \p F. +unsigned SampleFunctionProfile::getFunctionLoc(Function &F) { + NamedMDNode *CUNodes = F.getParent()->getNamedMetadata("llvm.dbg.cu"); + if (CUNodes) { + for (unsigned I = 0, E1 = CUNodes->getNumOperands(); I != E1; ++I) { + DICompileUnit CU(CUNodes->getOperand(I)); + DIArray Subprograms = CU.getSubprograms(); + for (unsigned J = 0, E2 = Subprograms.getNumElements(); J != E2; ++J) { + DISubprogram Subprogram(Subprograms.getElement(J)); + if (Subprogram.describes(&F)) + return Subprogram.getLineNumber(); + } + } + } + + F.getContext().diagnose(DiagnosticInfoSampleProfile( + "No debug information found in function " + F.getName())); + return 0; } -char SampleProfileLoader::ID = 0; -INITIALIZE_PASS(SampleProfileLoader, "sample-profile", "Sample Profile loader", - false, false) +/// \brief Generate branch weight metadata for all branches in \p F. +/// +/// Branch weights are computed out of instruction samples using a +/// propagation heuristic. Propagation proceeds in 3 phases: +/// +/// 1- Assignment of block weights. All the basic blocks in the function +/// are initially assigned the same weight as their most frequently +/// executed instruction. +/// +/// 2- Creation of equivalence classes. Since samples may be missing from +/// blocks, we can fill in the gaps by setting the weights of all the +/// blocks in the same equivalence class to the same weight. To compute +/// the concept of equivalence, we use dominance and loop information. +/// Two blocks B1 and B2 are in the same equivalence class if B1 +/// dominates B2, B2 post-dominates B1 and both are in the same loop. +/// +/// 3- Propagation of block weights into edges. This uses a simple +/// propagation heuristic. The following rules are applied to every +/// block B in the CFG: +/// +/// - If B has a single predecessor/successor, then the weight +/// of that edge is the weight of the block. +/// +/// - If all the edges are known except one, and the weight of the +/// block is already known, the weight of the unknown edge will +/// be the weight of the block minus the sum of all the known +/// edges. If the sum of all the known edges is larger than B's weight, +/// we set the unknown edge weight to zero. +/// +/// - If there is a self-referential edge, and the weight of the block is +/// known, the weight for that edge is set to the weight of the block +/// minus the weight of the other incoming edges to that block (if +/// known). +/// +/// Since this propagation is not guaranteed to terminate for every CFG, we +/// only allow it to proceed for a limited number of iterations (controlled +/// by -sample-profile-max-propagate-iterations). +/// +/// FIXME: Try to replace this propagation heuristic with a scheme +/// that is guaranteed to terminate. A work-list approach similar to +/// the standard value propagation algorithm used by SSA-CCP might +/// work here.
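The core of phase 3 is the single-unknown-edge rule: when a block's weight is known and all but one of its incoming (or outgoing) edges carry known weights, the remaining edge is forced to the difference, clamped at zero. A minimal sketch of one such step, with invented names and simplified types:

    #include <map>
    #include <utility>
    #include <vector>

    using BlockId = int;
    using Edge = std::pair<BlockId, BlockId>;

    // One propagation step for a single block, on one side (preds or
    // succs): if exactly one edge is unknown and the block weight is
    // known, that edge gets max(blockWeight - sumKnown, 0).
    bool propagateOneSide(unsigned BlockWeight, const std::vector<Edge> &Edges,
                          std::map<Edge, unsigned> &EdgeWeights) {
      unsigned SumKnown = 0, NumUnknown = 0;
      Edge Unknown{};
      for (const Edge &E : Edges) {
        auto It = EdgeWeights.find(E);
        if (It == EdgeWeights.end()) {
          ++NumUnknown;
          Unknown = E;
        } else {
          SumKnown += It->second;
        }
      }
      if (NumUnknown != 1)
        return false; // Nothing new can be deduced from this block.
      EdgeWeights[Unknown] =
          BlockWeight >= SumKnown ? BlockWeight - SumKnown : 0;
      return true;
    }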
+/// +/// Once all the branch weights are computed, we emit the MD_prof +/// metadata on B using the computed values for each of its branches. +/// +/// \param F The function to query. +/// +/// \returns true if \p F was modified. Returns false, otherwise. +bool SampleFunctionProfile::emitAnnotations(Function &F, DominatorTree *DomTree, + PostDominatorTree *PostDomTree, + LoopInfo *Loops) { + bool Changed = false; -bool SampleProfileLoader::runOnFunction(Function &F) { - return Profiler->emitAnnotations(F); + // Initialize invariants used during computation and propagation. + HeaderLineno = getFunctionLoc(F); + if (HeaderLineno == 0) + return false; + + DEBUG(dbgs() << "Line number for the first instruction in " << F.getName() + << ": " << HeaderLineno << "\n"); + DT = DomTree; + PDT = PostDomTree; + LI = Loops; + Ctx = &F.getParent()->getContext(); + + // Compute basic block weights. + Changed |= computeBlockWeights(F); + + if (Changed) { + // Find equivalence classes. + findEquivalenceClasses(F); + + // Propagate weights to all edges. + propagateWeights(F); + } + + return Changed; } +char SampleProfileLoader::ID = 0; +INITIALIZE_PASS_BEGIN(SampleProfileLoader, "sample-profile", + "Sample Profile loader", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(AddDiscriminators) +INITIALIZE_PASS_END(SampleProfileLoader, "sample-profile", + "Sample Profile loader", false, false) + bool SampleProfileLoader::doInitialization(Module &M) { - Profiler.reset(new SampleProfile(Filename)); - Profiler->loadText(); + Profiler.reset(new SampleModuleProfile(M, Filename)); + ProfileIsValid = Profiler->loadText(); return true; } @@ -477,3 +1087,15 @@ FunctionPass *llvm::createSampleProfileLoaderPass() { FunctionPass *llvm::createSampleProfileLoaderPass(StringRef Name) { return new SampleProfileLoader(Name); } + +bool SampleProfileLoader::runOnFunction(Function &F) { + if (!ProfileIsValid) + return false; + DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + PostDominatorTree *PDT = &getAnalysis<PostDominatorTree>(); + LoopInfo *LI = &getAnalysis<LoopInfo>(); + SampleFunctionProfile &FunctionProfile = Profiler->getProfile(F); + if (!FunctionProfile.empty()) + return FunctionProfile.emitAnnotations(F, DT, PDT, LI); + return false; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp index 857597e..de724d4 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -17,8 +17,8 @@ #include "llvm-c/Initialization.h" #include "llvm-c/Transforms/Scalar.h" #include "llvm/Analysis/Passes.h" -#include "llvm/Analysis/Verifier.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" #include "llvm/PassManager.h" @@ -29,11 +29,12 @@ using namespace llvm; void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeADCEPass(Registry); initializeSampleProfileLoaderPass(Registry); - initializeCodeGenPreparePass(Registry); + initializeConstantHoistingPass(Registry); initializeConstantPropagationPass(Registry); initializeCorrelatedValuePropagationPass(Registry); initializeDCEPass(Registry); initializeDeadInstEliminationPass(Registry); + initializeScalarizerPass(Registry); initializeDSEPass(Registry); initializeGVNPass(Registry); initializeEarlyCSEPass(Registry); @@ -51,6 +52,7 @@ void 
llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLowerAtomicPass(Registry); initializeLowerExpectIntrinsicPass(Registry); initializeMemCpyOptPass(Registry); + initializeMergedLoadStoreMotionPass(Registry); initializePartiallyInlineLibCallsPass(Registry); initializeReassociatePass(Registry); initializeRegToMemPass(Registry); @@ -63,6 +65,8 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeStructurizeCFGPass(Registry); initializeSinkingPass(Registry); initializeTailCallElimPass(Registry); + initializeSeparateConstOffsetFromGEPPass(Registry); + initializeLoadCombinePass(Registry); } void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) { @@ -81,10 +85,18 @@ void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createDeadStoreEliminationPass()); } +void LLVMAddScalarizerPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createScalarizerPass()); +} + void LLVMAddGVNPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createGVNPass()); } +void LLVMAddMergedLoadStoreMotionPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createMergedLoadStoreMotionPass()); +} + void LLVMAddIndVarSimplifyPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createIndVarSimplifyPass()); } @@ -176,6 +188,7 @@ void LLVMAddDemoteMemoryToRegisterPass(LLVMPassManagerRef PM) { void LLVMAddVerifierPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createVerifierPass()); + // FIXME: should this also add createDebugInfoVerifierPass()? } void LLVMAddCorrelatedValuePropagationPass(LLVMPassManagerRef PM) { diff --git a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp index 57b290e..e2a24a7 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -19,20 +19,21 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "scalarrepl" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/DIBuilder.h" -#include "llvm/DebugInfo.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DIBuilder.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -41,10 +42,8 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/Pass.h" -#include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" @@ -52,6 +51,8 @@ #include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; +#define DEBUG_TYPE "scalarrepl" + STATISTIC(NumReplaced, "Number of allocas broken up"); STATISTIC(NumPromoted, "Number of allocas promoted"); STATISTIC(NumAdjusted, "Number of scalar allocas adjusted to allow promotion"); @@ -80,14 +81,14 @@ namespace { ScalarLoadThreshold = SLT; } - bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; bool performScalarRepl(Function &F); bool 
performPromotion(Function &F); private: bool HasDomTree; - DataLayout *TD; + const DataLayout *DL; /// DeadInsts - Keep track of instructions we have made dead, so that /// we can remove them after we are done working. @@ -195,8 +196,8 @@ namespace { // getAnalysisUsage - This pass does not require any passes, but we know it // will not alter the CFG, so say so. - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<DominatorTree>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); } }; @@ -212,7 +213,7 @@ namespace { // getAnalysisUsage - This pass does not require any passes, but we know it // will not alter the CFG, so say so. - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); } }; @@ -224,7 +225,7 @@ char SROA_SSAUp::ID = 0; INITIALIZE_PASS_BEGIN(SROA_DT, "scalarrepl", "Scalar Replacement of Aggregates (DT)", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(SROA_DT, "scalarrepl", "Scalar Replacement of Aggregates (DT)", false, false) @@ -258,7 +259,7 @@ namespace { class ConvertToScalarInfo { /// AllocaSize - The size of the alloca being considered in bytes. unsigned AllocaSize; - const DataLayout &TD; + const DataLayout &DL; unsigned ScalarLoadThreshold; /// IsNotTrivial - This is set to true if there is some access to the object @@ -301,10 +302,10 @@ class ConvertToScalarInfo { bool HadDynamicAccess; public: - explicit ConvertToScalarInfo(unsigned Size, const DataLayout &td, + explicit ConvertToScalarInfo(unsigned Size, const DataLayout &DL, unsigned SLT) - : AllocaSize(Size), TD(td), ScalarLoadThreshold(SLT), IsNotTrivial(false), - ScalarKind(Unknown), VectorTy(0), HadNonMemTransferAccess(false), + : AllocaSize(Size), DL(DL), ScalarLoadThreshold(SLT), IsNotTrivial(false), + ScalarKind(Unknown), VectorTy(nullptr), HadNonMemTransferAccess(false), HadDynamicAccess(false) { } AllocaInst *TryConvert(AllocaInst *AI); @@ -332,8 +333,8 @@ private: AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { // If we can't convert this scalar, or if mem2reg can trivially do it, bail // out. - if (!CanConvertToScalar(AI, 0, 0) || !IsNotTrivial) - return 0; + if (!CanConvertToScalar(AI, 0, nullptr) || !IsNotTrivial) + return nullptr; // If an alloca has only memset / memcpy uses, it may still have an Unknown // ScalarKind. Treat it as an Integer below. @@ -361,23 +362,24 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { // Do not convert to scalar integer if the alloca size exceeds the // scalar load threshold. if (BitWidth > ScalarLoadThreshold) - return 0; + return nullptr; if ((ScalarKind == ImplicitVector || ScalarKind == Integer) && - !HadNonMemTransferAccess && !TD.fitsInLegalInteger(BitWidth)) - return 0; + !HadNonMemTransferAccess && !DL.fitsInLegalInteger(BitWidth)) + return nullptr; // Dynamic accesses on integers aren't yet supported. They need us to shift // by a dynamic amount which could be difficult to work out as we might not // know whether to use a left or right shift. if (ScalarKind == Integer && HadDynamicAccess) - return 0; + return nullptr; DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n"); // Create and insert the integer alloca. 
NewTy = IntegerType::get(AI->getContext(), BitWidth); } - AllocaInst *NewAI = new AllocaInst(NewTy, 0, "", AI->getParent()->begin()); - ConvertUsesToScalar(AI, NewAI, 0, 0); + AllocaInst *NewAI = new AllocaInst(NewTy, nullptr, "", + AI->getParent()->begin()); + ConvertUsesToScalar(AI, NewAI, 0, nullptr); return NewAI; } @@ -466,10 +468,10 @@ bool ConvertToScalarInfo::MergeInVectorType(VectorType *VInTy, /// SawVec flag. bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, Value* NonConstantIdx) { - for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) { - Instruction *User = cast<Instruction>(*UI); + for (User *U : V->users()) { + Instruction *UI = cast<Instruction>(U); - if (LoadInst *LI = dyn_cast<LoadInst>(User)) { + if (LoadInst *LI = dyn_cast<LoadInst>(UI)) { // Don't break volatile loads. if (!LI->isSimple()) return false; @@ -481,7 +483,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, continue; } - if (StoreInst *SI = dyn_cast<StoreInst>(User)) { + if (StoreInst *SI = dyn_cast<StoreInst>(UI)) { // Storing the pointer, not into the value? if (SI->getOperand(0) == V || !SI->isSimple()) return false; // Don't touch MMX operations. @@ -492,7 +494,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, continue; } - if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) { + if (BitCastInst *BCI = dyn_cast<BitCastInst>(UI)) { if (!onlyUsedByLifetimeMarkers(BCI)) IsNotTrivial = true; // Can't be mem2reg'd. if (!CanConvertToScalar(BCI, Offset, NonConstantIdx)) @@ -500,7 +502,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, continue; } - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) { + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UI)) { // If this is a GEP with a variable indices, we can't handle it. PointerType* PtrTy = dyn_cast<PointerType>(GEP->getPointerOperandType()); if (!PtrTy) @@ -508,7 +510,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, // Compute the offset that this GEP adds to the pointer. SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); - Value *GEPNonConstantIdx = 0; + Value *GEPNonConstantIdx = nullptr; if (!GEP->hasAllConstantIndices()) { if (!isa<VectorType>(PtrTy->getElementType())) return false; @@ -520,7 +522,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, HadDynamicAccess = true; } else GEPNonConstantIdx = NonConstantIdx; - uint64_t GEPOffset = TD.getIndexedOffset(PtrTy, + uint64_t GEPOffset = DL.getIndexedOffset(PtrTy, Indices); // See if all uses can be converted. if (!CanConvertToScalar(GEP, Offset+GEPOffset, GEPNonConstantIdx)) @@ -532,7 +534,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, // If this is a constant sized memset of a constant value (e.g. 0) we can // handle it. - if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) { + if (MemSetInst *MSI = dyn_cast<MemSetInst>(UI)) { // Store to dynamic index. if (NonConstantIdx) return false; @@ -559,12 +561,12 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, // If this is a memcpy or memmove into or out of the whole allocation, we // can handle it like a load or store of the scalar type. - if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) { + if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(UI)) { // Store to dynamic index. 
if (NonConstantIdx) return false; ConstantInt *Len = dyn_cast<ConstantInt>(MTI->getLength()); - if (Len == 0 || Len->getZExtValue() != AllocaSize || Offset != 0) + if (!Len || Len->getZExtValue() != AllocaSize || Offset != 0) return false; IsNotTrivial = true; // Can't be mem2reg'd. @@ -572,7 +574,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, } // If this is a lifetime intrinsic, we can handle it. - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(UI)) { if (II->getIntrinsicID() == Intrinsic::lifetime_start || II->getIntrinsicID() == Intrinsic::lifetime_end) { continue; @@ -597,7 +599,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset, Value* NonConstantIdx) { while (!Ptr->use_empty()) { - Instruction *User = cast<Instruction>(Ptr->use_back()); + Instruction *User = cast<Instruction>(Ptr->user_back()); if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) { ConvertUsesToScalar(CI, NewAI, Offset, NonConstantIdx); @@ -608,14 +610,14 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) { // Compute the offset that this GEP adds to the pointer. SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); - Value* GEPNonConstantIdx = 0; + Value* GEPNonConstantIdx = nullptr; if (!GEP->hasAllConstantIndices()) { assert(!NonConstantIdx && "Dynamic GEP reading from dynamic GEP unsupported"); GEPNonConstantIdx = Indices.pop_back_val(); } else GEPNonConstantIdx = NonConstantIdx; - uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(), + uint64_t GEPOffset = DL.getIndexedOffset(GEP->getPointerOperandType(), Indices); ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8, GEPNonConstantIdx); GEP->eraseFromParent(); @@ -671,7 +673,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); Value *New = ConvertScalar_InsertValue( ConstantInt::get(User->getContext(), APVal), - Old, Offset, 0, Builder); + Old, Offset, nullptr, Builder); Builder.CreateStore(New, NewAI); // If the load we just inserted is now dead, then the memset overwrote @@ -692,9 +694,9 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, // If the source and destination are both to the same alloca, then this is // a noop copy-to-self, just delete it. Otherwise, emit a load and store // as appropriate. - AllocaInst *OrigAI = cast<AllocaInst>(GetUnderlyingObject(Ptr, &TD, 0)); + AllocaInst *OrigAI = cast<AllocaInst>(GetUnderlyingObject(Ptr, &DL, 0)); - if (GetUnderlyingObject(MTI->getSource(), &TD, 0) != OrigAI) { + if (GetUnderlyingObject(MTI->getSource(), &DL, 0) != OrigAI) { // Dest must be OrigAI, change this to be a load from the original // pointer (bitcasted), then a store to our new alloca. assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?"); @@ -710,7 +712,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, LoadInst *SrcVal = Builder.CreateLoad(SrcPtr, "srcval"); SrcVal->setAlignment(MTI->getAlignment()); Builder.CreateStore(SrcVal, NewAI); - } else if (GetUnderlyingObject(MTI->getDest(), &TD, 0) != OrigAI) { + } else if (GetUnderlyingObject(MTI->getDest(), &DL, 0) != OrigAI) { // Src must be OrigAI, change this to be a load from NewAI then a store // through the original dest pointer (bitcasted). 
assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?"); @@ -770,15 +772,15 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, // If the result alloca is a vector type, this is either an element // access or a bitcast to another vector type of the same size. if (VectorType *VTy = dyn_cast<VectorType>(FromType)) { - unsigned FromTypeSize = TD.getTypeAllocSize(FromType); - unsigned ToTypeSize = TD.getTypeAllocSize(ToType); + unsigned FromTypeSize = DL.getTypeAllocSize(FromType); + unsigned ToTypeSize = DL.getTypeAllocSize(ToType); if (FromTypeSize == ToTypeSize) return Builder.CreateBitCast(FromVal, ToType); // Otherwise it must be an element access. unsigned Elt = 0; if (Offset) { - unsigned EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType()); + unsigned EltSize = DL.getTypeAllocSizeInBits(VTy->getElementType()); Elt = Offset/EltSize; assert(EltSize*Elt == Offset && "Invalid modulus in validity checking"); } @@ -804,12 +806,12 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, if (StructType *ST = dyn_cast<StructType>(ToType)) { assert(!NonConstantIdx && "Dynamic indexing into struct types not supported"); - const StructLayout &Layout = *TD.getStructLayout(ST); + const StructLayout &Layout = *DL.getStructLayout(ST); Value *Res = UndefValue::get(ST); for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i), Offset+Layout.getElementOffsetInBits(i), - 0, Builder); + nullptr, Builder); Res = Builder.CreateInsertValue(Res, Elt, i); } return Res; @@ -818,11 +820,12 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, if (ArrayType *AT = dyn_cast<ArrayType>(ToType)) { assert(!NonConstantIdx && "Dynamic indexing into array types not supported"); - uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType()); + uint64_t EltSize = DL.getTypeAllocSizeInBits(AT->getElementType()); Value *Res = UndefValue::get(AT); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(), - Offset+i*EltSize, 0, Builder); + Offset+i*EltSize, nullptr, + Builder); Res = Builder.CreateInsertValue(Res, Elt, i); } return Res; @@ -834,12 +837,12 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, // If this is a big-endian system and the load is narrower than the // full alloca type, we need to do a shift to get the right bits. int ShAmt = 0; - if (TD.isBigEndian()) { + if (DL.isBigEndian()) { // On big-endian machines, the lowest bit is stored at the bit offset // from the pointer given by getTypeStoreSizeInBits. This matters for // integers with a bitwidth that is not a multiple of 8. - ShAmt = TD.getTypeStoreSizeInBits(NTy) - - TD.getTypeStoreSizeInBits(ToType) - Offset; + ShAmt = DL.getTypeStoreSizeInBits(NTy) - + DL.getTypeStoreSizeInBits(ToType) - Offset; } else { ShAmt = Offset; } @@ -855,7 +858,7 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, ConstantInt::get(FromVal->getType(), -ShAmt)); // Finally, unconditionally truncate the integer to the right width. 
- unsigned LIBitWidth = TD.getTypeSizeInBits(ToType); + unsigned LIBitWidth = DL.getTypeSizeInBits(ToType); if (LIBitWidth < NTy->getBitWidth()) FromVal = Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(), @@ -902,8 +905,8 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, LLVMContext &Context = Old->getContext(); if (VectorType *VTy = dyn_cast<VectorType>(AllocaType)) { - uint64_t VecSize = TD.getTypeAllocSizeInBits(VTy); - uint64_t ValSize = TD.getTypeAllocSizeInBits(SV->getType()); + uint64_t VecSize = DL.getTypeAllocSizeInBits(VTy); + uint64_t ValSize = DL.getTypeAllocSizeInBits(SV->getType()); // Changing the whole vector with memset or with an access of a different // vector type? @@ -914,7 +917,7 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, Type *EltTy = VTy->getElementType(); if (SV->getType() != EltTy) SV = Builder.CreateBitCast(SV, EltTy); - uint64_t EltSize = TD.getTypeAllocSizeInBits(EltTy); + uint64_t EltSize = DL.getTypeAllocSizeInBits(EltTy); unsigned Elt = Offset/EltSize; Value *Idx; if (NonConstantIdx) { @@ -933,12 +936,12 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, if (StructType *ST = dyn_cast<StructType>(SV->getType())) { assert(!NonConstantIdx && "Dynamic indexing into struct types not supported"); - const StructLayout &Layout = *TD.getStructLayout(ST); + const StructLayout &Layout = *DL.getStructLayout(ST); for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { Value *Elt = Builder.CreateExtractValue(SV, i); Old = ConvertScalar_InsertValue(Elt, Old, Offset+Layout.getElementOffsetInBits(i), - 0, Builder); + nullptr, Builder); } return Old; } @@ -946,24 +949,25 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, if (ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) { assert(!NonConstantIdx && "Dynamic indexing into array types not supported"); - uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType()); + uint64_t EltSize = DL.getTypeAllocSizeInBits(AT->getElementType()); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { Value *Elt = Builder.CreateExtractValue(SV, i); - Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, 0, Builder); + Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, nullptr, + Builder); } return Old; } // If SV is a float, convert it to the appropriate integer type. // If it is a pointer, do the same. - unsigned SrcWidth = TD.getTypeSizeInBits(SV->getType()); - unsigned DestWidth = TD.getTypeSizeInBits(AllocaType); - unsigned SrcStoreWidth = TD.getTypeStoreSizeInBits(SV->getType()); - unsigned DestStoreWidth = TD.getTypeStoreSizeInBits(AllocaType); + unsigned SrcWidth = DL.getTypeSizeInBits(SV->getType()); + unsigned DestWidth = DL.getTypeSizeInBits(AllocaType); + unsigned SrcStoreWidth = DL.getTypeStoreSizeInBits(SV->getType()); + unsigned DestStoreWidth = DL.getTypeStoreSizeInBits(AllocaType); if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy()) SV = Builder.CreateBitCast(SV, IntegerType::get(SV->getContext(),SrcWidth)); else if (SV->getType()->isPointerTy()) - SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getType())); + SV = Builder.CreatePtrToInt(SV, DL.getIntPtrType(SV->getType())); // Zero extend or truncate the value if needed. if (SV->getType() != AllocaType) { @@ -982,7 +986,7 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, // If this is a big-endian system and the store is narrower than the // full alloca type, we need to do a shift to get the right bits. 
int ShAmt = 0; - if (TD.isBigEndian()) { + if (DL.isBigEndian()) { // On big-endian machines, the lowest bit is stored at the bit offset // from the pointer given by getTypeStoreSizeInBits. This matters for // integers with a bitwidth that is not a multiple of 8. @@ -1020,7 +1024,11 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, bool SROA::runOnFunction(Function &F) { - TD = getAnalysisIfAvailable<DataLayout>(); + if (skipOptnoneFunction(F)) + return false; + + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; bool Changed = performPromotion(F); @@ -1028,7 +1036,7 @@ bool SROA::runOnFunction(Function &F) { // theoretically needs to. It should be refactored in order to support // target-independent IR. Until this is done, just skip the actual // scalar-replacement portion of this pass. - if (!TD) return Changed; + if (!DL) return Changed; while (1) { bool LocalChange = performScalarRepl(F); @@ -1050,17 +1058,16 @@ class AllocaPromoter : public LoadAndStorePromoter { public: AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, DIBuilder *DB) - : LoadAndStorePromoter(Insts, S), AI(0), DIB(DB) {} + : LoadAndStorePromoter(Insts, S), AI(nullptr), DIB(DB) {} void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) { // Remember which alloca we're promoting (for isInstInList). this->AI = AI; if (MDNode *DebugNode = MDNode::getIfExists(AI->getContext(), AI)) { - for (Value::use_iterator UI = DebugNode->use_begin(), - E = DebugNode->use_end(); UI != E; ++UI) - if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI)) + for (User *U : DebugNode->users()) + if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) DDIs.push_back(DDI); - else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(*UI)) + else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U)) DVIs.push_back(DVI); } @@ -1078,14 +1085,14 @@ public: } } - virtual bool isInstInList(Instruction *I, - const SmallVectorImpl<Instruction*> &Insts) const { + bool isInstInList(Instruction *I, + const SmallVectorImpl<Instruction*> &Insts) const override { if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getOperand(0) == AI; return cast<StoreInst>(I)->getPointerOperand() == AI; } - virtual void updateDebugInfo(Instruction *Inst) const { + void updateDebugInfo(Instruction *Inst) const override { for (SmallVectorImpl<DbgDeclareInst *>::const_iterator I = DDIs.begin(), E = DDIs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; @@ -1097,7 +1104,7 @@ public: for (SmallVectorImpl<DbgValueInst *>::const_iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; - Value *Arg = NULL; + Value *Arg = nullptr; if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { // If an argument is zero extended then use argument directly. The ZExt // may be zapped by an optimization pass in future. @@ -1134,22 +1141,21 @@ public: /// /// We can do this to a select if its only uses are loads and if the operand to /// the select can be loaded unconditionally. 
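The transformation this predicate guards turns a load of a select into a select of two loads, which mirrors the rewrite the pass performs later. A minimal sketch of that rewrite using LLVM's IRBuilder (the helper name is invented; safety is assumed to have been checked by isSafeSelectToSpeculate):

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Rewrite "load (select c, p, q)" as "select c, (load p), (load q)".
    // LI is a load whose pointer operand is SI.
    static void speculateSelectLoad(SelectInst *SI, LoadInst *LI) {
      IRBuilder<> Builder(LI);
      LoadInst *TrueVal = Builder.CreateLoad(SI->getTrueValue(),
                                             LI->getName() + ".t");
      LoadInst *FalseVal = Builder.CreateLoad(SI->getFalseValue(),
                                              LI->getName() + ".f");
      Value *V = Builder.CreateSelect(SI->getCondition(), TrueVal, FalseVal);
      LI->replaceAllUsesWith(V);
      LI->eraseFromParent();
    }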
-static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *TD) {
-  bool TDerefable = SI->getTrueValue()->isDereferenceablePointer();
-  bool FDerefable = SI->getFalseValue()->isDereferenceablePointer();
+static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *DL) {
+  bool TDerefable = SI->getTrueValue()->isDereferenceablePointer(DL);
+  bool FDerefable = SI->getFalseValue()->isDereferenceablePointer(DL);

-  for (Value::use_iterator UI = SI->use_begin(), UE = SI->use_end();
-       UI != UE; ++UI) {
-    LoadInst *LI = dyn_cast<LoadInst>(*UI);
-    if (LI == 0 || !LI->isSimple()) return false;
+  for (User *U : SI->users()) {
+    LoadInst *LI = dyn_cast<LoadInst>(U);
+    if (!LI || !LI->isSimple()) return false;

    // Both operands to the select need to be dereferenceable, either absolutely
    // (e.g. allocas) or at this point because we can see other accesses to it.
    if (!TDerefable && !isSafeToLoadUnconditionally(SI->getTrueValue(), LI,
-                                                   LI->getAlignment(), TD))
+                                                   LI->getAlignment(), DL))
      return false;
    if (!FDerefable && !isSafeToLoadUnconditionally(SI->getFalseValue(), LI,
-                                                   LI->getAlignment(), TD))
+                                                   LI->getAlignment(), DL))
      return false;
  }
@@ -1172,17 +1178,16 @@ static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *TD) {
///
/// We can do this to a PHI if its only uses are loads and if the operands to
/// the PHI can be loaded unconditionally.
-static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *TD) {
+static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *DL) {
  // For now, we can only do this promotion if the load is in the same block as
  // the PHI, and if there are no stores between the phi and load.
  // TODO: Allow recursive phi users.
  // TODO: Allow stores.
  BasicBlock *BB = PN->getParent();
  unsigned MaxAlign = 0;
-  for (Value::use_iterator UI = PN->use_begin(), UE = PN->use_end();
-       UI != UE; ++UI) {
-    LoadInst *LI = dyn_cast<LoadInst>(*UI);
-    if (LI == 0 || !LI->isSimple()) return false;
+  for (User *U : PN->users()) {
+    LoadInst *LI = dyn_cast<LoadInst>(U);
+    if (!LI || !LI->isSimple()) return false;

    // For now we only allow loads in the same block as the PHI. This is a
    // common case that happens when instcombine merges two loads through a PHI.
@@ -1221,8 +1226,8 @@ static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *TD) {

    // If this pointer is always safe to load, or if we can prove that there is
    // already a load in the block, then we can move the load to the pred block.
-    if (InVal->isDereferenceablePointer() ||
-        isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign, TD))
+    if (InVal->isDereferenceablePointer(DL) ||
+        isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign, DL))
      continue;

    return false;
@@ -1236,13 +1241,10 @@ static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *TD) {
/// direct (non-volatile) loads and stores to it. If the alloca is close but
/// not quite there, this will transform the code to allow promotion. As such,
/// it is a non-pure predicate.
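///
/// For example (an illustrative sketch; the value names are made up), a load
/// of a PHI over the alloca
///   %P = phi i32* [ %AI, %bb1 ], [ %Other, %bb2 ]
///   %V = load i32* %P
/// can be rewritten as a load in each predecessor feeding a PHI of the
/// loaded values, eliminating the pointer PHI that blocks promotion:
///   ; in %bb1:  %V1 = load i32* %AI
///   ; in %bb2:  %V2 = load i32* %Other
///   %V = phi i32 [ %V1, %bb1 ], [ %V2, %bb2 ]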
-static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) { +static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *DL) { SetVector<Instruction*, SmallVector<Instruction*, 4>, SmallPtrSet<Instruction*, 4> > InstsToRewrite; - - for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end(); - UI != UE; ++UI) { - User *U = *UI; + for (User *U : AI->users()) { if (LoadInst *LI = dyn_cast<LoadInst>(U)) { if (!LI->isSimple()) return false; @@ -1265,12 +1267,12 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) { // This is very rare and we just scrambled the use list of AI, start // over completely. - return tryToMakeAllocaBePromotable(AI, TD); + return tryToMakeAllocaBePromotable(AI, DL); } // If it is safe to turn "load (select c, AI, ptr)" into a select of two // loads, then we can transform this by rewriting the select. - if (!isSafeSelectToSpeculate(SI, TD)) + if (!isSafeSelectToSpeculate(SI, DL)) return false; InstsToRewrite.insert(SI); @@ -1285,7 +1287,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) { // If it is safe to turn "load (phi [AI, ptr, ...])" into a PHI of loads // in the pred blocks, then we can transform this by rewriting the PHI. - if (!isSafePHIToSpeculate(PN, TD)) + if (!isSafePHIToSpeculate(PN, DL)) return false; InstsToRewrite.insert(PN); @@ -1312,12 +1314,9 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) { for (unsigned i = 0, e = InstsToRewrite.size(); i != e; ++i) { if (BitCastInst *BCI = dyn_cast<BitCastInst>(InstsToRewrite[i])) { // This could only be a bitcast used by nothing but lifetime intrinsics. - for (BitCastInst::use_iterator I = BCI->use_begin(), E = BCI->use_end(); - I != E;) { - Use &U = I.getUse(); - ++I; - cast<Instruction>(U.getUser())->eraseFromParent(); - } + for (BitCastInst::user_iterator I = BCI->user_begin(), E = BCI->user_end(); + I != E;) + cast<Instruction>(*I++)->eraseFromParent(); BCI->eraseFromParent(); continue; } @@ -1326,7 +1325,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) { // Selects in InstsToRewrite only have load uses. Rewrite each as two // loads with a new select. while (!SI->use_empty()) { - LoadInst *LI = cast<LoadInst>(SI->use_back()); + LoadInst *LI = cast<LoadInst>(SI->user_back()); IRBuilder<> Builder(LI); LoadInst *TrueLoad = @@ -1367,13 +1366,13 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) { // Get the TBAA tag and alignment to use from one of the loads. It doesn't // matter which one we get and if any differ, it doesn't matter. - LoadInst *SomeLoad = cast<LoadInst>(PN->use_back()); + LoadInst *SomeLoad = cast<LoadInst>(PN->user_back()); MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); unsigned Align = SomeLoad->getAlignment(); // Rewrite all loads of the PN to use the new PHI. while (!PN->use_empty()) { - LoadInst *LI = cast<LoadInst>(PN->use_back()); + LoadInst *LI = cast<LoadInst>(PN->user_back()); LI->replaceAllUsesWith(NewPN); LI->eraseFromParent(); } @@ -1385,7 +1384,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) { for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { BasicBlock *Pred = PN->getIncomingBlock(i); LoadInst *&Load = InsertedLoads[Pred]; - if (Load == 0) { + if (!Load) { Load = new LoadInst(PN->getIncomingValue(i), PN->getName() + "." 
+ Pred->getName(), Pred->getTerminator()); @@ -1405,9 +1404,9 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) { bool SROA::performPromotion(Function &F) { std::vector<AllocaInst*> Allocas; - DominatorTree *DT = 0; + DominatorTree *DT = nullptr; if (HasDomTree) - DT = &getAnalysis<DominatorTree>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function DIBuilder DIB(*F.getParent()); @@ -1420,7 +1419,7 @@ bool SROA::performPromotion(Function &F) { // the entry node for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) // Is it an alloca? - if (tryToMakeAllocaBePromotable(AI, TD)) + if (tryToMakeAllocaBePromotable(AI, DL)) Allocas.push_back(AI); if (Allocas.empty()) break; @@ -1433,9 +1432,8 @@ bool SROA::performPromotion(Function &F) { AllocaInst *AI = Allocas[i]; // Build list of instructions to promote. - for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); - UI != E; ++UI) - Insts.push_back(cast<Instruction>(*UI)); + for (User *U : AI->users()) + Insts.push_back(cast<Instruction>(U)); AllocaPromoter(Insts, SSA, &DIB).run(AI, Insts); Insts.clear(); } @@ -1496,7 +1494,7 @@ bool SROA::performScalarRepl(Function &F) { // transform the allocation instruction if it is an array allocation // (allocations OF arrays are ok though), and an allocation of a scalar // value cannot be decomposed at all. - uint64_t AllocaSize = TD->getTypeAllocSize(AI->getAllocatedType()); + uint64_t AllocaSize = DL->getTypeAllocSize(AI->getAllocatedType()); // Do not promote [0 x %struct]. if (AllocaSize == 0) continue; @@ -1520,7 +1518,7 @@ bool SROA::performScalarRepl(Function &F) { // that we can't just check based on the type: the alloca may be of an i32 // but that has pointer arithmetic to set byte 3 of it or something. if (AllocaInst *NewAI = ConvertToScalarInfo( - (unsigned)AllocaSize, *TD, ScalarLoadThreshold).TryConvert(AI)) { + (unsigned)AllocaSize, *DL, ScalarLoadThreshold).TryConvert(AI)) { NewAI->takeName(AI); AI->eraseFromParent(); ++NumConverted; @@ -1543,7 +1541,7 @@ void SROA::DoScalarReplacement(AllocaInst *AI, if (StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) { ElementAllocas.reserve(ST->getNumContainedTypes()); for (unsigned i = 0, e = ST->getNumContainedTypes(); i != e; ++i) { - AllocaInst *NA = new AllocaInst(ST->getContainedType(i), 0, + AllocaInst *NA = new AllocaInst(ST->getContainedType(i), nullptr, AI->getAlignment(), AI->getName() + "." + Twine(i), AI); ElementAllocas.push_back(NA); @@ -1554,7 +1552,7 @@ void SROA::DoScalarReplacement(AllocaInst *AI, ElementAllocas.reserve(AT->getNumElements()); Type *ElTy = AT->getElementType(); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { - AllocaInst *NA = new AllocaInst(ElTy, 0, AI->getAlignment(), + AllocaInst *NA = new AllocaInst(ElTy, nullptr, AI->getAlignment(), AI->getName() + "." + Twine(i), AI); ElementAllocas.push_back(NA); WorkList.push_back(NA); // Add to worklist for recursive processing @@ -1583,7 +1581,7 @@ void SROA::DeleteDeadInstructions() { // Zero out the operand and see if it becomes trivially dead. // (But, don't add allocas to the dead instruction list -- they are // already on the worklist and will be deleted separately.) 
- *OI = 0; + *OI = nullptr; if (isInstructionTriviallyDead(U) && !isa<AllocaInst>(U)) DeadInsts.push_back(U); } @@ -1598,8 +1596,8 @@ void SROA::DeleteDeadInstructions() { /// referenced by this instruction. void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, AllocaInfo &Info) { - for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E; ++UI) { - Instruction *User = cast<Instruction>(*UI); + for (Use &U : I->uses()) { + Instruction *User = cast<Instruction>(U.getUser()); if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) { isSafeForScalarRepl(BC, Offset, Info); @@ -1610,19 +1608,17 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, isSafeForScalarRepl(GEPI, GEPOffset, Info); } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) { ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength()); - if (Length == 0) - return MarkUnsafe(Info, User); - if (Length->isNegative()) + if (!Length || Length->isNegative()) return MarkUnsafe(Info, User); - isSafeMemAccess(Offset, Length->getZExtValue(), 0, - UI.getOperandNo() == 0, Info, MI, + isSafeMemAccess(Offset, Length->getZExtValue(), nullptr, + U.getOperandNo() == 0, Info, MI, true /*AllowWholeAccess*/); } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) { if (!LI->isSimple()) return MarkUnsafe(Info, User); Type *LIType = LI->getType(); - isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType), + isSafeMemAccess(Offset, DL->getTypeAllocSize(LIType), LIType, false, Info, LI, true /*AllowWholeAccess*/); Info.hasALoadOrStore = true; @@ -1632,7 +1628,7 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, return MarkUnsafe(Info, User); Type *SIType = SI->getOperand(0)->getType(); - isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType), + isSafeMemAccess(Offset, DL->getTypeAllocSize(SIType), SIType, true, Info, SI, true /*AllowWholeAccess*/); Info.hasALoadOrStore = true; } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { @@ -1665,39 +1661,39 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, if (!Info.CheckedPHIs.insert(PN)) return; - for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E; ++UI) { - Instruction *User = cast<Instruction>(*UI); + for (User *U : I->users()) { + Instruction *UI = cast<Instruction>(U); - if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) { + if (BitCastInst *BC = dyn_cast<BitCastInst>(UI)) { isSafePHISelectUseForScalarRepl(BC, Offset, Info); - } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) { + } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(UI)) { // Only allow "bitcast" GEPs for simplicity. We could generalize this, // but would have to prove that we're staying inside of an element being // promoted. 
if (!GEPI->hasAllZeroIndices()) - return MarkUnsafe(Info, User); + return MarkUnsafe(Info, UI); isSafePHISelectUseForScalarRepl(GEPI, Offset, Info); - } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) { + } else if (LoadInst *LI = dyn_cast<LoadInst>(UI)) { if (!LI->isSimple()) - return MarkUnsafe(Info, User); + return MarkUnsafe(Info, UI); Type *LIType = LI->getType(); - isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType), + isSafeMemAccess(Offset, DL->getTypeAllocSize(LIType), LIType, false, Info, LI, false /*AllowWholeAccess*/); Info.hasALoadOrStore = true; - } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { + } else if (StoreInst *SI = dyn_cast<StoreInst>(UI)) { // Store is ok if storing INTO the pointer, not storing the pointer if (!SI->isSimple() || SI->getOperand(0) == I) - return MarkUnsafe(Info, User); + return MarkUnsafe(Info, UI); Type *SIType = SI->getOperand(0)->getType(); - isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType), + isSafeMemAccess(Offset, DL->getTypeAllocSize(SIType), SIType, true, Info, SI, false /*AllowWholeAccess*/); Info.hasALoadOrStore = true; - } else if (isa<PHINode>(User) || isa<SelectInst>(User)) { - isSafePHISelectUseForScalarRepl(User, Offset, Info); + } else if (isa<PHINode>(UI) || isa<SelectInst>(UI)) { + isSafePHISelectUseForScalarRepl(UI, Offset, Info); } else { - return MarkUnsafe(Info, User); + return MarkUnsafe(Info, UI); } if (Info.isUnsafe) return; } @@ -1731,12 +1727,12 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI, // Compute the offset due to this GEP and check if the alloca has a // component element at that offset. SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end()); - // If this GEP is non constant then the last operand must have been a + // If this GEP is non-constant then the last operand must have been a // dynamic index into a vector. Pop this now as it has no impact on the // constant part of the offset. if (NonConstant) Indices.pop_back(); - Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices); + Offset += DL->getIndexedOffset(GEPI->getPointerOperandType(), Indices); if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, NonConstantIdxSize)) MarkUnsafe(Info, GEPI); @@ -1750,12 +1746,12 @@ static bool isHomogeneousAggregate(Type *T, unsigned &NumElts, Type *&EltTy) { if (ArrayType *AT = dyn_cast<ArrayType>(T)) { NumElts = AT->getNumElements(); - EltTy = (NumElts == 0 ? 0 : AT->getElementType()); + EltTy = (NumElts == 0 ? nullptr : AT->getElementType()); return true; } if (StructType *ST = dyn_cast<StructType>(T)) { NumElts = ST->getNumContainedTypes(); - EltTy = (NumElts == 0 ? 0 : ST->getContainedType(0)); + EltTy = (NumElts == 0 ? nullptr : ST->getContainedType(0)); for (unsigned n = 1; n < NumElts; ++n) { if (ST->getContainedType(n) != EltTy) return false; @@ -1795,7 +1791,7 @@ void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize, bool AllowWholeAccess) { // Check if this is a load/store of the entire alloca. if (Offset == 0 && AllowWholeAccess && - MemSize == TD->getTypeAllocSize(Info.AI->getAllocatedType())) { + MemSize == DL->getTypeAllocSize(Info.AI->getAllocatedType())) { // This can be safe for MemIntrinsics (where MemOpType is 0) and integer // loads/stores (which are essentially the same as the MemIntrinsics with // regard to copying padding between elements). 
But, if an alloca is @@ -1832,20 +1828,20 @@ bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) { Type *EltTy; uint64_t EltSize; if (StructType *ST = dyn_cast<StructType>(T)) { - const StructLayout *Layout = TD->getStructLayout(ST); + const StructLayout *Layout = DL->getStructLayout(ST); unsigned EltIdx = Layout->getElementContainingOffset(Offset); EltTy = ST->getContainedType(EltIdx); - EltSize = TD->getTypeAllocSize(EltTy); + EltSize = DL->getTypeAllocSize(EltTy); Offset -= Layout->getElementOffset(EltIdx); } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) { EltTy = AT->getElementType(); - EltSize = TD->getTypeAllocSize(EltTy); + EltSize = DL->getTypeAllocSize(EltTy); if (Offset >= AT->getNumElements() * EltSize) return false; Offset %= EltSize; } else if (VectorType *VT = dyn_cast<VectorType>(T)) { EltTy = VT->getElementType(); - EltSize = TD->getTypeAllocSize(EltTy); + EltSize = DL->getTypeAllocSize(EltTy); if (Offset >= VT->getNumElements() * EltSize) return false; Offset %= EltSize; @@ -1867,8 +1863,8 @@ bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) { void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, SmallVectorImpl<AllocaInst *> &NewElts) { for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E;) { - Use &TheUse = UI.getUse(); - Instruction *User = cast<Instruction>(*UI++); + Use &TheUse = *UI++; + Instruction *User = cast<Instruction>(TheUse.getUser()); if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) { RewriteBitCast(BC, AI, Offset, NewElts); @@ -1884,7 +1880,7 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength()); uint64_t MemSize = Length->getZExtValue(); if (Offset == 0 && - MemSize == TD->getTypeAllocSize(AI->getAllocatedType())) + MemSize == DL->getTypeAllocSize(AI->getAllocatedType())) RewriteMemIntrinUserOfAlloca(MI, I, AI, NewElts); // Otherwise the intrinsic can only touch a single element and the // address operand will be updated, so nothing else needs to be done. @@ -1920,8 +1916,8 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, LI->replaceAllUsesWith(Insert); DeadInsts.push_back(LI); } else if (LIType->isIntegerTy() && - TD->getTypeAllocSize(LIType) == - TD->getTypeAllocSize(AI->getAllocatedType())) { + DL->getTypeAllocSize(LIType) == + DL->getTypeAllocSize(AI->getAllocatedType())) { // If this is a load of the entire alloca to an integer, rewrite it. RewriteLoadUserOfWholeAlloca(LI, AI, NewElts); } @@ -1947,8 +1943,8 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, } DeadInsts.push_back(SI); } else if (SIType->isIntegerTy() && - TD->getTypeAllocSize(SIType) == - TD->getTypeAllocSize(AI->getAllocatedType())) { + DL->getTypeAllocSize(SIType) == + DL->getTypeAllocSize(AI->getAllocatedType())) { // If this is a store of the entire alloca from an integer, rewrite it. 
RewriteStoreUserOfWholeAlloca(SI, AI, NewElts); } @@ -2010,7 +2006,7 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, Type *&IdxTy) { uint64_t Idx = 0; if (StructType *ST = dyn_cast<StructType>(T)) { - const StructLayout *Layout = TD->getStructLayout(ST); + const StructLayout *Layout = DL->getStructLayout(ST); Idx = Layout->getElementContainingOffset(Offset); T = ST->getContainedType(Idx); Offset -= Layout->getElementOffset(Idx); @@ -2018,7 +2014,7 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, return Idx; } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) { T = AT->getElementType(); - uint64_t EltSize = TD->getTypeAllocSize(T); + uint64_t EltSize = DL->getTypeAllocSize(T); Idx = Offset / EltSize; Offset -= Idx * EltSize; IdxTy = Type::getInt64Ty(T->getContext()); @@ -2026,7 +2022,7 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, } VectorType *VT = cast<VectorType>(T); T = VT->getElementType(); - uint64_t EltSize = TD->getTypeAllocSize(T); + uint64_t EltSize = DL->getTypeAllocSize(T); Idx = Offset / EltSize; Offset -= Idx * EltSize; IdxTy = Type::getInt64Ty(T->getContext()); @@ -2044,10 +2040,10 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, // In this case, it must be the last GEP operand which is dynamic so keep that // aside until we've found the constant GEP offset then add it back in at the // end. - Value* NonConstantIdx = 0; + Value* NonConstantIdx = nullptr; if (!GEPI->hasAllConstantIndices()) NonConstantIdx = Indices.pop_back_val(); - Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices); + Offset += DL->getIndexedOffset(GEPI->getPointerOperandType(), Indices); RewriteForScalarRepl(GEPI, AI, Offset, NewElts); @@ -2114,11 +2110,12 @@ void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, if (NewOffset) { // Splice the first element and index 'NewOffset' bytes in. SROA will // split the alloca again later. - Value *V = Builder.CreateBitCast(NewElts[Idx], Builder.getInt8PtrTy()); + unsigned AS = AI->getType()->getAddressSpace(); + Value *V = Builder.CreateBitCast(NewElts[Idx], Builder.getInt8PtrTy(AS)); V = Builder.CreateGEP(V, Builder.getInt64(NewOffset)); IdxTy = NewElts[Idx]->getAllocatedType(); - uint64_t EltSize = TD->getTypeAllocSize(IdxTy) - NewOffset; + uint64_t EltSize = DL->getTypeAllocSize(IdxTy) - NewOffset; if (EltSize > Size) { EltSize = Size; Size = 0; @@ -2134,7 +2131,7 @@ void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, for (; Idx != NewElts.size() && Size; ++Idx) { IdxTy = NewElts[Idx]->getAllocatedType(); - uint64_t EltSize = TD->getTypeAllocSize(IdxTy); + uint64_t EltSize = DL->getTypeAllocSize(IdxTy); if (EltSize > Size) { EltSize = Size; Size = 0; @@ -2161,7 +2158,7 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, // appropriate type. The "Other" pointer is the pointer that goes to memory // that doesn't have anything to do with the alloca that we are promoting. For // memset, this Value* stays null. - Value *OtherPtr = 0; + Value *OtherPtr = nullptr; unsigned MemAlignment = MI->getAlignment(); if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { // memmove/memcopy if (Inst == MTI->getRawDest()) @@ -2213,7 +2210,7 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { // If this is a memcpy/memmove, emit a GEP of the other element address. 
- Value *OtherElt = 0; + Value *OtherElt = nullptr; unsigned OtherEltAlign = MemAlignment; if (OtherPtr) { @@ -2226,10 +2223,10 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType()); Type *OtherTy = OtherPtrTy->getElementType(); if (StructType *ST = dyn_cast<StructType>(OtherTy)) { - EltOffset = TD->getStructLayout(ST)->getElementOffset(i); + EltOffset = DL->getStructLayout(ST)->getElementOffset(i); } else { Type *EltTy = cast<SequentialType>(OtherTy)->getElementType(); - EltOffset = TD->getTypeAllocSize(EltTy)*i; + EltOffset = DL->getTypeAllocSize(EltTy)*i; } // The alignment of the other pointer is the guaranteed alignment of the @@ -2270,7 +2267,7 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, Type *ValTy = EltTy->getScalarType(); // Construct an integer with the right value. - unsigned EltSize = TD->getTypeSizeInBits(ValTy); + unsigned EltSize = DL->getTypeSizeInBits(ValTy); APInt OneVal(EltSize, CI->getZExtValue()); APInt TotalVal(OneVal); // Set each byte. @@ -2300,7 +2297,7 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, // this element. } - unsigned EltSize = TD->getTypeAllocSize(EltTy); + unsigned EltSize = DL->getTypeAllocSize(EltTy); if (!EltSize) continue; @@ -2334,12 +2331,12 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, // and store the element value to the individual alloca. Value *SrcVal = SI->getOperand(0); Type *AllocaEltTy = AI->getAllocatedType(); - uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy); + uint64_t AllocaSizeBits = DL->getTypeAllocSizeInBits(AllocaEltTy); IRBuilder<> Builder(SI); // Handle tail padding by extending the operand - if (TD->getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits) + if (DL->getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits) SrcVal = Builder.CreateZExt(SrcVal, IntegerType::get(SI->getContext(), AllocaSizeBits)); @@ -2349,15 +2346,15 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, // There are two forms here: AI could be an array or struct. Both cases // have different ways to compute the element offset. if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) { - const StructLayout *Layout = TD->getStructLayout(EltSTy); + const StructLayout *Layout = DL->getStructLayout(EltSTy); for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { // Get the number of bits to shift SrcVal to get the value. Type *FieldTy = EltSTy->getElementType(i); uint64_t Shift = Layout->getElementOffsetInBits(i); - if (TD->isBigEndian()) - Shift = AllocaSizeBits-Shift-TD->getTypeAllocSizeInBits(FieldTy); + if (DL->isBigEndian()) + Shift = AllocaSizeBits-Shift-DL->getTypeAllocSizeInBits(FieldTy); Value *EltVal = SrcVal; if (Shift) { @@ -2366,7 +2363,7 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, } // Truncate down to an integer of the right size. - uint64_t FieldSizeBits = TD->getTypeSizeInBits(FieldTy); + uint64_t FieldSizeBits = DL->getTypeSizeInBits(FieldTy); // Ignore zero sized fields like {}, they obviously contain no data. 
if (FieldSizeBits == 0) continue; @@ -2391,12 +2388,12 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, } else { ArrayType *ATy = cast<ArrayType>(AllocaEltTy); Type *ArrayEltTy = ATy->getElementType(); - uint64_t ElementOffset = TD->getTypeAllocSizeInBits(ArrayEltTy); - uint64_t ElementSizeBits = TD->getTypeSizeInBits(ArrayEltTy); + uint64_t ElementOffset = DL->getTypeAllocSizeInBits(ArrayEltTy); + uint64_t ElementSizeBits = DL->getTypeSizeInBits(ArrayEltTy); uint64_t Shift; - if (TD->isBigEndian()) + if (DL->isBigEndian()) Shift = AllocaSizeBits-ElementOffset; else Shift = 0; @@ -2430,7 +2427,7 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, } new StoreInst(EltVal, DestField, SI); - if (TD->isBigEndian()) + if (DL->isBigEndian()) Shift -= ElementOffset; else Shift += ElementOffset; @@ -2448,20 +2445,20 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, // Extract each element out of the NewElts according to its structure offset // and form the result value. Type *AllocaEltTy = AI->getAllocatedType(); - uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy); + uint64_t AllocaSizeBits = DL->getTypeAllocSizeInBits(AllocaEltTy); DEBUG(dbgs() << "PROMOTING LOAD OF WHOLE ALLOCA: " << *AI << '\n' << *LI << '\n'); // There are two forms here: AI could be an array or struct. Both cases // have different ways to compute the element offset. - const StructLayout *Layout = 0; + const StructLayout *Layout = nullptr; uint64_t ArrayEltBitOffset = 0; if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) { - Layout = TD->getStructLayout(EltSTy); + Layout = DL->getStructLayout(EltSTy); } else { Type *ArrayEltTy = cast<ArrayType>(AllocaEltTy)->getElementType(); - ArrayEltBitOffset = TD->getTypeAllocSizeInBits(ArrayEltTy); + ArrayEltBitOffset = DL->getTypeAllocSizeInBits(ArrayEltTy); } Value *ResultVal = @@ -2473,7 +2470,7 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, Value *SrcField = NewElts[i]; Type *FieldTy = cast<PointerType>(SrcField->getType())->getElementType(); - uint64_t FieldSizeBits = TD->getTypeSizeInBits(FieldTy); + uint64_t FieldSizeBits = DL->getTypeSizeInBits(FieldTy); // Ignore zero sized fields like {}, they obviously contain no data. if (FieldSizeBits == 0) continue; @@ -2504,7 +2501,7 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, else // Array case. Shift = i*ArrayEltBitOffset; - if (TD->isBigEndian()) + if (DL->isBigEndian()) Shift = AllocaSizeBits-Shift-FieldIntTy->getBitWidth(); if (Shift) { @@ -2521,7 +2518,7 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, } // Handle tail padding by truncating the result - if (TD->getTypeSizeInBits(LI->getType()) != AllocaSizeBits) + if (DL->getTypeSizeInBits(LI->getType()) != AllocaSizeBits) ResultVal = new TruncInst(ResultVal, LI->getType(), "", LI); LI->replaceAllUsesWith(ResultVal); @@ -2531,15 +2528,15 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, /// HasPadding - Return true if the specified type has any structure or /// alignment padding in between the elements that would be split apart /// by SROA; return false otherwise. 
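/// For example (assuming the usual layout where i32 is 4-byte aligned),
/// { i8, i32 } has three bytes of padding between its fields and
/// { i32, i8 } has three bytes of tail padding, so both return true, while
/// a tightly packed { i32, i32 } returns false.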
-static bool HasPadding(Type *Ty, const DataLayout &TD) { +static bool HasPadding(Type *Ty, const DataLayout &DL) { if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { Ty = ATy->getElementType(); - return TD.getTypeSizeInBits(Ty) != TD.getTypeAllocSizeInBits(Ty); + return DL.getTypeSizeInBits(Ty) != DL.getTypeAllocSizeInBits(Ty); } // SROA currently handles only Arrays and Structs. StructType *STy = cast<StructType>(Ty); - const StructLayout *SL = TD.getStructLayout(STy); + const StructLayout *SL = DL.getStructLayout(STy); unsigned PrevFieldBitOffset = 0; for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { unsigned FieldBitOffset = SL->getElementOffsetInBits(i); @@ -2548,7 +2545,7 @@ static bool HasPadding(Type *Ty, const DataLayout &TD) { // previous one. if (i) { unsigned PrevFieldEnd = - PrevFieldBitOffset+TD.getTypeSizeInBits(STy->getElementType(i-1)); + PrevFieldBitOffset+DL.getTypeSizeInBits(STy->getElementType(i-1)); if (PrevFieldEnd < FieldBitOffset) return true; } @@ -2557,7 +2554,7 @@ static bool HasPadding(Type *Ty, const DataLayout &TD) { // Check for tail padding. if (unsigned EltCount = STy->getNumElements()) { unsigned PrevFieldEnd = PrevFieldBitOffset + - TD.getTypeSizeInBits(STy->getElementType(EltCount-1)); + DL.getTypeSizeInBits(STy->getElementType(EltCount-1)); if (PrevFieldEnd < SL->getSizeInBits()) return true; } @@ -2584,7 +2581,7 @@ bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) { // types, but may actually be used. In these cases, we refuse to promote the // struct. if (Info.isMemCpySrc && Info.isMemCpyDst && - HasPadding(AI->getAllocatedType(), *TD)) + HasPadding(AI->getAllocatedType(), *DL)) return false; // If the alloca never has an access to just *part* of it, but is accessed diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp new file mode 100644 index 0000000..7a73f11 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -0,0 +1,663 @@ +//===--- Scalarizer.cpp - Scalarize vector operations ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass converts vector operations into scalar operations, in order +// to expose optimization opportunities on the individual scalar operations. +// It is mainly intended for targets that do not have vector units, but it +// may also be useful for revectorizing code to different vector widths. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/STLExtras.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "scalarizer" + +namespace { +// Used to store the scattered form of a vector. +typedef SmallVector<Value *, 8> ValueVector; + +// Used to map a vector Value to its scattered form. We use std::map +// because we want iterators to persist across insertion and because the +// values are relatively large. +typedef std::map<Value *, ValueVector> ScatterMap; + +// Lists Instructions that have been replaced with scalar implementations, +// along with a pointer to their scattered forms. 
+typedef SmallVector<std::pair<Instruction *, ValueVector *>, 16> GatherList;
+
+// Provides a very limited vector-like interface for lazily accessing one
+// component of a scattered vector or vector pointer.
+class Scatterer {
+public:
+  Scatterer() {}
+
+  // Scatter V into Size components. If new instructions are needed,
+  // insert them before BBI in BB. If CachePtr is nonnull, use it to cache
+  // the results.
+  Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
+            ValueVector *cachePtr = nullptr);
+
+  // Return component I, creating a new Value for it if necessary.
+  Value *operator[](unsigned I);
+
+  // Return the number of components.
+  unsigned size() const { return Size; }
+
+private:
+  BasicBlock *BB;
+  BasicBlock::iterator BBI;
+  Value *V;
+  ValueVector *CachePtr;
+  PointerType *PtrTy;
+  ValueVector Tmp;
+  unsigned Size;
+};
+
+// FCmpSplitter(FCI)(Builder, X, Y, Name) uses Builder to create an FCmp
+// called Name that compares X and Y in the same way as FCI.
+struct FCmpSplitter {
+  FCmpSplitter(FCmpInst &fci) : FCI(fci) {}
+  Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
+                    const Twine &Name) const {
+    return Builder.CreateFCmp(FCI.getPredicate(), Op0, Op1, Name);
+  }
+  FCmpInst &FCI;
+};
+
+// ICmpSplitter(ICI)(Builder, X, Y, Name) uses Builder to create an ICmp
+// called Name that compares X and Y in the same way as ICI.
+struct ICmpSplitter {
+  ICmpSplitter(ICmpInst &ici) : ICI(ici) {}
+  Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
+                    const Twine &Name) const {
+    return Builder.CreateICmp(ICI.getPredicate(), Op0, Op1, Name);
+  }
+  ICmpInst &ICI;
+};
+
+// BinarySplitter(BO)(Builder, X, Y, Name) uses Builder to create
+// a binary operator like BO called Name with operands X and Y.
+struct BinarySplitter {
+  BinarySplitter(BinaryOperator &bo) : BO(bo) {}
+  Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
+                    const Twine &Name) const {
+    return Builder.CreateBinOp(BO.getOpcode(), Op0, Op1, Name);
+  }
+  BinaryOperator &BO;
+};
+
+// Information about a load or store that we're scalarizing.
+struct VectorLayout {
+  VectorLayout() : VecTy(nullptr), ElemTy(nullptr), VecAlign(0), ElemSize(0) {}
+
+  // Return the alignment of element I.
+  uint64_t getElemAlign(unsigned I) {
+    return MinAlign(VecAlign, I * ElemSize);
+  }
+
+  // The type of the vector.
+  VectorType *VecTy;
+
+  // The type of each element.
+  Type *ElemTy;
+
+  // The alignment of the vector.
+  uint64_t VecAlign;
+
+  // The size of each element.
+  uint64_t ElemSize;
+};
+
+class Scalarizer : public FunctionPass,
+                   public InstVisitor<Scalarizer, bool> {
+public:
+  static char ID;
+
+  Scalarizer() :
+    FunctionPass(ID) {
+    initializeScalarizerPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool doInitialization(Module &M) override;
+  bool runOnFunction(Function &F) override;
+
+  // InstVisitor methods. They return true if the instruction was scalarized,
+  // false if nothing changed.
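+  // For example (a sketch of the intended effect; the names follow the
+  // ".i<N>" convention used below), visitBinaryOperator rewrites
+  //   %r = fadd <2 x float> %x, %y
+  // as
+  //   %r.i0 = fadd float %x.i0, %y.i0
+  //   %r.i1 = fadd float %x.i1, %y.i1
+  // where %x.i0 and %x.i1 come from scattering %x; the pieces are gathered
+  // back into a vector later only if %r is still used as one.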
+  bool visitInstruction(Instruction &) { return false; }
+  bool visitSelectInst(SelectInst &SI);
+  bool visitICmpInst(ICmpInst &);
+  bool visitFCmpInst(FCmpInst &);
+  bool visitBinaryOperator(BinaryOperator &);
+  bool visitGetElementPtrInst(GetElementPtrInst &);
+  bool visitCastInst(CastInst &);
+  bool visitBitCastInst(BitCastInst &);
+  bool visitShuffleVectorInst(ShuffleVectorInst &);
+  bool visitPHINode(PHINode &);
+  bool visitLoadInst(LoadInst &);
+  bool visitStoreInst(StoreInst &);
+
+private:
+  Scatterer scatter(Instruction *, Value *);
+  void gather(Instruction *, const ValueVector &);
+  bool canTransferMetadata(unsigned Kind);
+  void transferMetadata(Instruction *, const ValueVector &);
+  bool getVectorLayout(Type *, unsigned, VectorLayout &);
+  bool finish();
+
+  template<typename T> bool splitBinary(Instruction &, const T &);
+
+  ScatterMap Scattered;
+  GatherList Gathered;
+  unsigned ParallelLoopAccessMDKind;
+  const DataLayout *DL;
+};
+
+char Scalarizer::ID = 0;
+} // end anonymous namespace
+
+// This is disabled by default because having separate loads and stores makes
+// it more likely that the -combiner-alias-analysis limits will be reached.
+static cl::opt<bool> ScalarizeLoadStore
+  ("scalarize-load-store", cl::Hidden, cl::init(false),
+   cl::desc("Allow the scalarizer pass to scalarize loads and stores"));
+
+INITIALIZE_PASS(Scalarizer, "scalarizer", "Scalarize vector operations",
+                false, false)
+
+Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
+                     ValueVector *cachePtr)
+  : BB(bb), BBI(bbi), V(v), CachePtr(cachePtr) {
+  Type *Ty = V->getType();
+  PtrTy = dyn_cast<PointerType>(Ty);
+  if (PtrTy)
+    Ty = PtrTy->getElementType();
+  Size = Ty->getVectorNumElements();
+  if (!CachePtr)
+    Tmp.resize(Size, nullptr);
+  else if (CachePtr->empty())
+    CachePtr->resize(Size, nullptr);
+  else
+    assert(Size == CachePtr->size() && "Inconsistent vector sizes");
+}
+
+// Return component I, creating a new Value for it if necessary.
+Value *Scatterer::operator[](unsigned I) {
+  ValueVector &CV = (CachePtr ? *CachePtr : Tmp);
+  // Try to reuse a previous value.
+  if (CV[I])
+    return CV[I];
+  IRBuilder<> Builder(BB, BBI);
+  if (PtrTy) {
+    if (!CV[0]) {
+      Type *Ty =
+        PointerType::get(PtrTy->getElementType()->getVectorElementType(),
+                         PtrTy->getAddressSpace());
+      CV[0] = Builder.CreateBitCast(V, Ty, V->getName() + ".i0");
+    }
+    if (I != 0)
+      CV[I] = Builder.CreateConstGEP1_32(CV[0], I,
+                                         V->getName() + ".i" + Twine(I));
+  } else {
+    // Search through a chain of InsertElementInsts looking for element I.
+    // Record other elements in the cache. The new V is still suitable
+    // for all uncached indices.
+    for (;;) {
+      InsertElementInst *Insert = dyn_cast<InsertElementInst>(V);
+      if (!Insert)
+        break;
+      ConstantInt *Idx = dyn_cast<ConstantInt>(Insert->getOperand(2));
+      if (!Idx)
+        break;
+      unsigned J = Idx->getZExtValue();
+      CV[J] = Insert->getOperand(1);
+      V = Insert->getOperand(0);
+      if (I == J)
+        return CV[J];
+    }
+    CV[I] = Builder.CreateExtractElement(V, Builder.getInt32(I),
+                                         V->getName() + ".i" + Twine(I));
+  }
+  return CV[I];
+}
+
+bool Scalarizer::doInitialization(Module &M) {
+  ParallelLoopAccessMDKind =
+    M.getContext().getMDKindID("llvm.mem.parallel_loop_access");
+  return false;
+}
+
+bool Scalarizer::runOnFunction(Function &F) {
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  DL = DLP ? &DLP->getDataLayout() : nullptr;
+  for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) {
+    BasicBlock *BB = BBI;
+    for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) {
+      Instruction *I = II;
+      bool Done = visit(I);
+      ++II;
+      if (Done && I->getType()->isVoidTy())
+        I->eraseFromParent();
+    }
+  }
+  return finish();
+}
+
+// Return a scattered form of V that can be accessed by Point. V must be a
+// vector or a pointer to a vector.
+Scatterer Scalarizer::scatter(Instruction *Point, Value *V) {
+  if (Argument *VArg = dyn_cast<Argument>(V)) {
+    // Put the scattered form of arguments in the entry block,
+    // so that it can be used everywhere.
+    Function *F = VArg->getParent();
+    BasicBlock *BB = &F->getEntryBlock();
+    return Scatterer(BB, BB->begin(), V, &Scattered[V]);
+  }
+  if (Instruction *VOp = dyn_cast<Instruction>(V)) {
+    // Put the scattered form of an instruction directly after the
+    // instruction.
+    BasicBlock *BB = VOp->getParent();
+    return Scatterer(BB, std::next(BasicBlock::iterator(VOp)),
+                     V, &Scattered[V]);
+  }
+  // In the fallback case, just put the scattered form before Point and
+  // keep the result local to Point.
+  return Scatterer(Point->getParent(), Point, V);
+}
+
+// Replace Op with the gathered form of the components in CV. Defer the
+// deletion of Op and creation of the gathered form to the end of the pass,
+// so that we can avoid creating the gathered form if all uses of Op are
+// replaced with uses of CV.
+void Scalarizer::gather(Instruction *Op, const ValueVector &CV) {
+  // Since we're not deleting Op yet, stub out its operands, so that it
+  // doesn't make anything live unnecessarily.
+  for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I)
+    Op->setOperand(I, UndefValue::get(Op->getOperand(I)->getType()));
+
+  transferMetadata(Op, CV);
+
+  // If we already have a scattered form of Op (created from ExtractElements
+  // of Op itself), replace them with the new form.
+  ValueVector &SV = Scattered[Op];
+  if (!SV.empty()) {
+    for (unsigned I = 0, E = SV.size(); I != E; ++I) {
+      Instruction *Old = cast<Instruction>(SV[I]);
+      CV[I]->takeName(Old);
+      Old->replaceAllUsesWith(CV[I]);
+      Old->eraseFromParent();
+    }
+  }
+  SV = CV;
+  Gathered.push_back(GatherList::value_type(Op, &SV));
+}
+
+// Return true if it is safe to transfer the given metadata tag from
+// vector to scalar instructions.
+bool Scalarizer::canTransferMetadata(unsigned Tag) {
+  return (Tag == LLVMContext::MD_tbaa
+          || Tag == LLVMContext::MD_fpmath
+          || Tag == LLVMContext::MD_tbaa_struct
+          || Tag == LLVMContext::MD_invariant_load
+          || Tag == ParallelLoopAccessMDKind);
+}
+
+// Transfer metadata from Op to the instructions in CV if it is known
+// to be safe to do so.
+void Scalarizer::transferMetadata(Instruction *Op, const ValueVector &CV) {
+  SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+  Op->getAllMetadataOtherThanDebugLoc(MDs);
+  for (unsigned I = 0, E = CV.size(); I != E; ++I) {
+    if (Instruction *New = dyn_cast<Instruction>(CV[I])) {
+      for (SmallVectorImpl<std::pair<unsigned, MDNode *> >::iterator
+             MI = MDs.begin(), ME = MDs.end(); MI != ME; ++MI)
+        if (canTransferMetadata(MI->first))
+          New->setMetadata(MI->first, MI->second);
+      New->setDebugLoc(Op->getDebugLoc());
+    }
+  }
+}
+
+// Try to fill in Layout from Ty, returning true on success. Alignment is
+// the alignment of the vector, or 0 if the ABI default should be used.
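+//
+// For example (assuming a target where float is 4 bytes wide and
+// <4 x float> has 16-byte ABI alignment), a <4 x float> access yields
+// ElemTy = float, ElemSize = 4, VecAlign = 16, and
+// getElemAlign(1) == MinAlign(16, 4) == 4.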
+bool Scalarizer::getVectorLayout(Type *Ty, unsigned Alignment, + VectorLayout &Layout) { + if (!DL) + return false; + + // Make sure we're dealing with a vector. + Layout.VecTy = dyn_cast<VectorType>(Ty); + if (!Layout.VecTy) + return false; + + // Check that we're dealing with full-byte elements. + Layout.ElemTy = Layout.VecTy->getElementType(); + if (DL->getTypeSizeInBits(Layout.ElemTy) != + DL->getTypeStoreSizeInBits(Layout.ElemTy)) + return false; + + if (Alignment) + Layout.VecAlign = Alignment; + else + Layout.VecAlign = DL->getABITypeAlignment(Layout.VecTy); + Layout.ElemSize = DL->getTypeStoreSize(Layout.ElemTy); + return true; +} + +// Scalarize two-operand instruction I, using Split(Builder, X, Y, Name) +// to create an instruction like I with operands X and Y and name Name. +template<typename Splitter> +bool Scalarizer::splitBinary(Instruction &I, const Splitter &Split) { + VectorType *VT = dyn_cast<VectorType>(I.getType()); + if (!VT) + return false; + + unsigned NumElems = VT->getNumElements(); + IRBuilder<> Builder(I.getParent(), &I); + Scatterer Op0 = scatter(&I, I.getOperand(0)); + Scatterer Op1 = scatter(&I, I.getOperand(1)); + assert(Op0.size() == NumElems && "Mismatched binary operation"); + assert(Op1.size() == NumElems && "Mismatched binary operation"); + ValueVector Res; + Res.resize(NumElems); + for (unsigned Elem = 0; Elem < NumElems; ++Elem) + Res[Elem] = Split(Builder, Op0[Elem], Op1[Elem], + I.getName() + ".i" + Twine(Elem)); + gather(&I, Res); + return true; +} + +bool Scalarizer::visitSelectInst(SelectInst &SI) { + VectorType *VT = dyn_cast<VectorType>(SI.getType()); + if (!VT) + return false; + + unsigned NumElems = VT->getNumElements(); + IRBuilder<> Builder(SI.getParent(), &SI); + Scatterer Op1 = scatter(&SI, SI.getOperand(1)); + Scatterer Op2 = scatter(&SI, SI.getOperand(2)); + assert(Op1.size() == NumElems && "Mismatched select"); + assert(Op2.size() == NumElems && "Mismatched select"); + ValueVector Res; + Res.resize(NumElems); + + if (SI.getOperand(0)->getType()->isVectorTy()) { + Scatterer Op0 = scatter(&SI, SI.getOperand(0)); + assert(Op0.size() == NumElems && "Mismatched select"); + for (unsigned I = 0; I < NumElems; ++I) + Res[I] = Builder.CreateSelect(Op0[I], Op1[I], Op2[I], + SI.getName() + ".i" + Twine(I)); + } else { + Value *Op0 = SI.getOperand(0); + for (unsigned I = 0; I < NumElems; ++I) + Res[I] = Builder.CreateSelect(Op0, Op1[I], Op2[I], + SI.getName() + ".i" + Twine(I)); + } + gather(&SI, Res); + return true; +} + +bool Scalarizer::visitICmpInst(ICmpInst &ICI) { + return splitBinary(ICI, ICmpSplitter(ICI)); +} + +bool Scalarizer::visitFCmpInst(FCmpInst &FCI) { + return splitBinary(FCI, FCmpSplitter(FCI)); +} + +bool Scalarizer::visitBinaryOperator(BinaryOperator &BO) { + return splitBinary(BO, BinarySplitter(BO)); +} + +bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) { + VectorType *VT = dyn_cast<VectorType>(GEPI.getType()); + if (!VT) + return false; + + IRBuilder<> Builder(GEPI.getParent(), &GEPI); + unsigned NumElems = VT->getNumElements(); + unsigned NumIndices = GEPI.getNumIndices(); + + Scatterer Base = scatter(&GEPI, GEPI.getOperand(0)); + + SmallVector<Scatterer, 8> Ops; + Ops.resize(NumIndices); + for (unsigned I = 0; I < NumIndices; ++I) + Ops[I] = scatter(&GEPI, GEPI.getOperand(I + 1)); + + ValueVector Res; + Res.resize(NumElems); + for (unsigned I = 0; I < NumElems; ++I) { + SmallVector<Value *, 8> Indices; + Indices.resize(NumIndices); + for (unsigned J = 0; J < NumIndices; ++J) + Indices[J] = Ops[J][I]; + 
Res[I] = Builder.CreateGEP(Base[I], Indices, + GEPI.getName() + ".i" + Twine(I)); + if (GEPI.isInBounds()) + if (GetElementPtrInst *NewGEPI = dyn_cast<GetElementPtrInst>(Res[I])) + NewGEPI->setIsInBounds(); + } + gather(&GEPI, Res); + return true; +} + +bool Scalarizer::visitCastInst(CastInst &CI) { + VectorType *VT = dyn_cast<VectorType>(CI.getDestTy()); + if (!VT) + return false; + + unsigned NumElems = VT->getNumElements(); + IRBuilder<> Builder(CI.getParent(), &CI); + Scatterer Op0 = scatter(&CI, CI.getOperand(0)); + assert(Op0.size() == NumElems && "Mismatched cast"); + ValueVector Res; + Res.resize(NumElems); + for (unsigned I = 0; I < NumElems; ++I) + Res[I] = Builder.CreateCast(CI.getOpcode(), Op0[I], VT->getElementType(), + CI.getName() + ".i" + Twine(I)); + gather(&CI, Res); + return true; +} + +bool Scalarizer::visitBitCastInst(BitCastInst &BCI) { + VectorType *DstVT = dyn_cast<VectorType>(BCI.getDestTy()); + VectorType *SrcVT = dyn_cast<VectorType>(BCI.getSrcTy()); + if (!DstVT || !SrcVT) + return false; + + unsigned DstNumElems = DstVT->getNumElements(); + unsigned SrcNumElems = SrcVT->getNumElements(); + IRBuilder<> Builder(BCI.getParent(), &BCI); + Scatterer Op0 = scatter(&BCI, BCI.getOperand(0)); + ValueVector Res; + Res.resize(DstNumElems); + + if (DstNumElems == SrcNumElems) { + for (unsigned I = 0; I < DstNumElems; ++I) + Res[I] = Builder.CreateBitCast(Op0[I], DstVT->getElementType(), + BCI.getName() + ".i" + Twine(I)); + } else if (DstNumElems > SrcNumElems) { + // <M x t1> -> <N*M x t2>. Convert each t1 to <N x t2> and copy the + // individual elements to the destination. + unsigned FanOut = DstNumElems / SrcNumElems; + Type *MidTy = VectorType::get(DstVT->getElementType(), FanOut); + unsigned ResI = 0; + for (unsigned Op0I = 0; Op0I < SrcNumElems; ++Op0I) { + Value *V = Op0[Op0I]; + Instruction *VI; + // Look through any existing bitcasts before converting to <N x t2>. + // In the best case, the resulting conversion might be a no-op. + while ((VI = dyn_cast<Instruction>(V)) && + VI->getOpcode() == Instruction::BitCast) + V = VI->getOperand(0); + V = Builder.CreateBitCast(V, MidTy, V->getName() + ".cast"); + Scatterer Mid = scatter(&BCI, V); + for (unsigned MidI = 0; MidI < FanOut; ++MidI) + Res[ResI++] = Mid[MidI]; + } + } else { + // <N*M x t1> -> <M x t2>. Convert each group of <N x t1> into a t2. 
+ unsigned FanIn = SrcNumElems / DstNumElems; + Type *MidTy = VectorType::get(SrcVT->getElementType(), FanIn); + unsigned Op0I = 0; + for (unsigned ResI = 0; ResI < DstNumElems; ++ResI) { + Value *V = UndefValue::get(MidTy); + for (unsigned MidI = 0; MidI < FanIn; ++MidI) + V = Builder.CreateInsertElement(V, Op0[Op0I++], Builder.getInt32(MidI), + BCI.getName() + ".i" + Twine(ResI) + + ".upto" + Twine(MidI)); + Res[ResI] = Builder.CreateBitCast(V, DstVT->getElementType(), + BCI.getName() + ".i" + Twine(ResI)); + } + } + gather(&BCI, Res); + return true; +} + +bool Scalarizer::visitShuffleVectorInst(ShuffleVectorInst &SVI) { + VectorType *VT = dyn_cast<VectorType>(SVI.getType()); + if (!VT) + return false; + + unsigned NumElems = VT->getNumElements(); + Scatterer Op0 = scatter(&SVI, SVI.getOperand(0)); + Scatterer Op1 = scatter(&SVI, SVI.getOperand(1)); + ValueVector Res; + Res.resize(NumElems); + + for (unsigned I = 0; I < NumElems; ++I) { + int Selector = SVI.getMaskValue(I); + if (Selector < 0) + Res[I] = UndefValue::get(VT->getElementType()); + else if (unsigned(Selector) < Op0.size()) + Res[I] = Op0[Selector]; + else + Res[I] = Op1[Selector - Op0.size()]; + } + gather(&SVI, Res); + return true; +} + +bool Scalarizer::visitPHINode(PHINode &PHI) { + VectorType *VT = dyn_cast<VectorType>(PHI.getType()); + if (!VT) + return false; + + unsigned NumElems = VT->getNumElements(); + IRBuilder<> Builder(PHI.getParent(), &PHI); + ValueVector Res; + Res.resize(NumElems); + + unsigned NumOps = PHI.getNumOperands(); + for (unsigned I = 0; I < NumElems; ++I) + Res[I] = Builder.CreatePHI(VT->getElementType(), NumOps, + PHI.getName() + ".i" + Twine(I)); + + for (unsigned I = 0; I < NumOps; ++I) { + Scatterer Op = scatter(&PHI, PHI.getIncomingValue(I)); + BasicBlock *IncomingBlock = PHI.getIncomingBlock(I); + for (unsigned J = 0; J < NumElems; ++J) + cast<PHINode>(Res[J])->addIncoming(Op[J], IncomingBlock); + } + gather(&PHI, Res); + return true; +} + +bool Scalarizer::visitLoadInst(LoadInst &LI) { + if (!ScalarizeLoadStore) + return false; + if (!LI.isSimple()) + return false; + + VectorLayout Layout; + if (!getVectorLayout(LI.getType(), LI.getAlignment(), Layout)) + return false; + + unsigned NumElems = Layout.VecTy->getNumElements(); + IRBuilder<> Builder(LI.getParent(), &LI); + Scatterer Ptr = scatter(&LI, LI.getPointerOperand()); + ValueVector Res; + Res.resize(NumElems); + + for (unsigned I = 0; I < NumElems; ++I) + Res[I] = Builder.CreateAlignedLoad(Ptr[I], Layout.getElemAlign(I), + LI.getName() + ".i" + Twine(I)); + gather(&LI, Res); + return true; +} + +bool Scalarizer::visitStoreInst(StoreInst &SI) { + if (!ScalarizeLoadStore) + return false; + if (!SI.isSimple()) + return false; + + VectorLayout Layout; + Value *FullValue = SI.getValueOperand(); + if (!getVectorLayout(FullValue->getType(), SI.getAlignment(), Layout)) + return false; + + unsigned NumElems = Layout.VecTy->getNumElements(); + IRBuilder<> Builder(SI.getParent(), &SI); + Scatterer Ptr = scatter(&SI, SI.getPointerOperand()); + Scatterer Val = scatter(&SI, FullValue); + + ValueVector Stores; + Stores.resize(NumElems); + for (unsigned I = 0; I < NumElems; ++I) { + unsigned Align = Layout.getElemAlign(I); + Stores[I] = Builder.CreateAlignedStore(Val[I], Ptr[I], Align); + } + transferMetadata(&SI, Stores); + return true; +} + +// Delete the instructions that we scalarized. If a full vector result +// is still needed, recreate it using InsertElements. 
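+//
+// For example (a sketch), a scalarized <2 x float> %r that still has vector
+// users is recreated as
+//   %r.upto0 = insertelement <2 x float> undef, float %r.i0, i32 0
+//   %r.upto1 = insertelement <2 x float> %r.upto0, float %r.i1, i32 1
+// and the final insertelement then takes over the name %r.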
+bool Scalarizer::finish() { + if (Gathered.empty()) + return false; + for (GatherList::iterator GMI = Gathered.begin(), GME = Gathered.end(); + GMI != GME; ++GMI) { + Instruction *Op = GMI->first; + ValueVector &CV = *GMI->second; + if (!Op->use_empty()) { + // The value is still needed, so recreate it using a series of + // InsertElements. + Type *Ty = Op->getType(); + Value *Res = UndefValue::get(Ty); + BasicBlock *BB = Op->getParent(); + unsigned Count = Ty->getVectorNumElements(); + IRBuilder<> Builder(BB, Op); + if (isa<PHINode>(Op)) + Builder.SetInsertPoint(BB, BB->getFirstInsertionPt()); + for (unsigned I = 0; I < Count; ++I) + Res = Builder.CreateInsertElement(Res, CV[I], Builder.getInt32(I), + Op->getName() + ".upto" + Twine(I)); + Res->takeName(Op); + Op->replaceAllUsesWith(Res); + } + Op->eraseFromParent(); + } + Gathered.clear(); + Scattered.clear(); + return true; +} + +FunctionPass *llvm::createScalarizerPass() { + return new Scalarizer(); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp new file mode 100644 index 0000000..6557ce4 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -0,0 +1,776 @@ +//===-- SeparateConstOffsetFromGEP.cpp - ------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Loop unrolling may create many similar GEPs for array accesses. +// e.g., a 2-level loop +// +// float a[32][32]; // global variable +// +// for (int i = 0; i < 2; ++i) { +// for (int j = 0; j < 2; ++j) { +// ... +// ... = a[x + i][y + j]; +// ... +// } +// } +// +// will probably be unrolled to: +// +// gep %a, 0, %x, %y; load +// gep %a, 0, %x, %y + 1; load +// gep %a, 0, %x + 1, %y; load +// gep %a, 0, %x + 1, %y + 1; load +// +// LLVM's GVN does not use partial redundancy elimination yet, and is thus +// unable to reuse (gep %a, 0, %x, %y). As a result, this misoptimization incurs +// significant slowdown in targets with limited addressing modes. For instance, +// because the PTX target does not support the reg+reg addressing mode, the +// NVPTX backend emits PTX code that literally computes the pointer address of +// each GEP, wasting tons of registers. It emits the following PTX for the +// first load and similar PTX for other loads. +// +// mov.u32 %r1, %x; +// mov.u32 %r2, %y; +// mul.wide.u32 %rl2, %r1, 128; +// mov.u64 %rl3, a; +// add.s64 %rl4, %rl3, %rl2; +// mul.wide.u32 %rl5, %r2, 4; +// add.s64 %rl6, %rl4, %rl5; +// ld.global.f32 %f1, [%rl6]; +// +// To reduce the register pressure, the optimization implemented in this file +// merges the common part of a group of GEPs, so we can compute each pointer +// address by adding a simple offset to the common part, saving many registers. +// +// It works by splitting each GEP into a variadic base and a constant offset. +// The variadic base can be computed once and reused by multiple GEPs, and the +// constant offsets can be nicely folded into the reg+immediate addressing mode +// (supported by most targets) without using any extra register. 
+//
+// For instance, we transform the four GEPs and four loads in the above example
+// into:
+//
+// base = gep a, 0, x, y
+// load base
+// load base + 1 * sizeof(float)
+// load base + 32 * sizeof(float)
+// load base + 33 * sizeof(float)
+//
+// Given the transformed IR, a backend that supports the reg+immediate
+// addressing mode can easily fold the pointer arithmetic into the loads. For
+// example, the NVPTX backend can easily fold the pointer arithmetic into the
+// ld.global.f32 instructions, and the resultant PTX uses far fewer registers.
+//
+// mov.u32 %r1, %tid.x;
+// mov.u32 %r2, %tid.y;
+// mul.wide.u32 %rl2, %r1, 128;
+// mov.u64 %rl3, a;
+// add.s64 %rl4, %rl3, %rl2;
+// mul.wide.u32 %rl5, %r2, 4;
+// add.s64 %rl6, %rl4, %rl5;
+// ld.global.f32 %f1, [%rl6]; // so far the same as unoptimized PTX
+// ld.global.f32 %f2, [%rl6+4]; // much better
+// ld.global.f32 %f3, [%rl6+128]; // much better
+// ld.global.f32 %f4, [%rl6+132]; // much better
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
+    "disable-separate-const-offset-from-gep", cl::init(false),
+    cl::desc("Do not separate the constant offset from a GEP instruction"),
+    cl::Hidden);
+
+namespace {
+
+/// \brief A helper class for separating a constant offset from a GEP index.
+///
+/// In real programs, a GEP index may be more complicated than a simple addition
+/// of something and a constant integer that can be trivially split. For
+/// example, to split ((a << 3) | 5) + b, we need to search deeper for the
+/// constant offset, so that we can separate the index into (a << 3) + b and 5.
+///
+/// Therefore, this class looks into the expression that computes a given GEP
+/// index, and tries to find a constant integer that can be hoisted to the
+/// outermost level of the expression as an addition. Not every constant in an
+/// expression can jump out. e.g., we cannot transform (b * (a + 5)) to
+/// (b * a + 5); nor can we transform (3 * (a + 5)) to (3 * a + 5); however,
+/// in this case, -instcombine probably already optimized (3 * (a + 5)) to
+/// (3 * a + 15).
+class ConstantOffsetExtractor {
+ public:
+  /// Extracts a constant offset from the given GEP index. It outputs the
+  /// numeric value of the extracted constant offset (0 if failed), and a
+  /// new index representing the remainder (equal to the original index minus
+  /// the constant offset).
+  /// \p Idx    The given GEP index
+  /// \p NewIdx The new index to replace (output)
+  /// \p DL     The data layout of the module
+  /// \p GEP    The given GEP
+  static int64_t Extract(Value *Idx, Value *&NewIdx, const DataLayout *DL,
+                         GetElementPtrInst *GEP);
+  /// Looks for a constant offset without extracting it. The meaning of the
+  /// arguments and the return value are the same as Extract.
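+  ///
+  /// For example (a sketch), given Idx = a + 5, both Extract and Find report
+  /// a constant offset of 5, and Extract additionally builds NewIdx = a to
+  /// stand for the remaining, variadic part of the index.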
+  static int64_t Find(Value *Idx, const DataLayout *DL, GetElementPtrInst *GEP);
+
+ private:
+  ConstantOffsetExtractor(const DataLayout *Layout, Instruction *InsertionPt)
+      : DL(Layout), IP(InsertionPt) {}
+  /// Searches the expression that computes V for a non-zero constant C s.t.
+  /// V can be reassociated into the form V' + C. If the search is
+  /// successful, returns C and updates UserChain as a def-use chain from C to
+  /// V; otherwise, UserChain is empty.
+  ///
+  /// \p V            The given expression
+  /// \p SignExtended Whether V will be sign-extended in the computation of the
+  ///                 GEP index
+  /// \p ZeroExtended Whether V will be zero-extended in the computation of the
+  ///                 GEP index
+  /// \p NonNegative  Whether V is guaranteed to be non-negative. For example,
+  ///                 an index of an inbounds GEP is guaranteed to be
+  ///                 non-negative. Leveraging this, we can better split
+  ///                 inbounds GEPs.
+  APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative);
+  /// A helper function to look into both operands of a binary operator.
+  APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended,
+                            bool ZeroExtended);
+  /// After finding the constant offset C from the GEP index I, we build a new
+  /// index I' s.t. I' + C = I. This function builds and returns the new
+  /// index I' according to UserChain produced by function "find".
+  ///
+  /// The building conceptually takes two steps:
+  /// 1) iteratively distribute s/zext towards the leaves of the expression
+  ///    tree that computes I
+  /// 2) reassociate the expression tree to the form I' + C.
+  ///
+  /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
+  /// sext to a, b and 5 so that we have
+  ///   sext(a) + (sext(b) + 5).
+  /// Then, we reassociate it to
+  ///   (sext(a) + sext(b)) + 5.
+  /// Given this form, we know I' is sext(a) + sext(b).
+  Value *rebuildWithoutConstOffset();
+  /// After the first step of rebuilding the GEP index without the constant
+  /// offset, distribute s/zext to the operands of all operators in UserChain.
+  /// e.g., zext(sext(a + (b + 5))) (assuming no overflow) =>
+  /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
+  ///
+  /// The function also updates UserChain to point to new subexpressions after
+  /// distributing s/zext. e.g., the old UserChain of the above example is
+  /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
+  /// and the new UserChain is
+  /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
+  ///   zext(sext(a)) + (zext(sext(b)) + zext(sext(5)))
+  ///
+  /// \p ChainIndex The index to UserChain. ChainIndex is initially
+  ///               UserChain.size() - 1, and is decremented during
+  ///               the recursion.
+  Value *distributeExtsAndCloneChain(unsigned ChainIndex);
+  /// Reassociates the GEP index to the form I' + C and returns I'.
+  Value *removeConstOffset(unsigned ChainIndex);
+  /// A helper function to apply ExtInsts, a list of s/zext, to value V.
+  /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
+  /// returns "sext i32 (zext i16 V to i32) to i64".
+  Value *applyExts(Value *V);
+
+  /// Returns true if LHS and RHS have no bits in common, i.e.,
+  /// (LHS & RHS) == 0.
+  bool NoCommonBits(Value *LHS, Value *RHS) const;
+  /// Computes which bits are known to be one or zero.
+  /// \p KnownOne  Mask of all bits that are known to be one.
+  /// \p KnownZero Mask of all bits that are known to be zero.
+ void ComputeKnownBits(Value *V, APInt &KnownOne, APInt &KnownZero) const; + /// A helper function that returns whether we can trace into the operands + /// of binary operator BO for a constant offset. + /// + /// \p SignExtended Whether BO is surrounded by sext + /// \p ZeroExtended Whether BO is surrounded by zext + /// \p NonNegative Whether BO is known to be non-negative, e.g., an in-bound + /// array index. + bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO, + bool NonNegative); + + /// The path from the constant offset to the old GEP index. e.g., if the GEP + /// index is "a * b + (c + 5)". After running function find, UserChain[0] will + /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and + /// UserChain[2] will be the entire expression "a * b + (c + 5)". + /// + /// This path helps to rebuild the new GEP index. + SmallVector<User *, 8> UserChain; + /// A data structure used in rebuildWithoutConstOffset. Contains all + /// sext/zext instructions along UserChain. + SmallVector<CastInst *, 16> ExtInsts; + /// The data layout of the module. Used in ComputeKnownBits. + const DataLayout *DL; + Instruction *IP; /// Insertion position of cloned instructions. +}; + +/// \brief A pass that tries to split every GEP in the function into a variadic +/// base and a constant offset. It is a FunctionPass because searching for the +/// constant offset may inspect other basic blocks. +class SeparateConstOffsetFromGEP : public FunctionPass { + public: + static char ID; + SeparateConstOffsetFromGEP() : FunctionPass(ID) { + initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DataLayoutPass>(); + AU.addRequired<TargetTransformInfo>(); + } + + bool doInitialization(Module &M) override { + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + if (DLP == nullptr) + report_fatal_error("data layout missing"); + DL = &DLP->getDataLayout(); + return false; + } + + bool runOnFunction(Function &F) override; + + private: + /// Tries to split the given GEP into a variadic base and a constant offset, + /// and returns true if the splitting succeeds. + bool splitGEP(GetElementPtrInst *GEP); + /// Finds the constant offset within each index, and accumulates them. This + /// function only inspects the GEP without changing it. The output + /// NeedsExtraction indicates whether we can extract a non-zero constant + /// offset from any index. + int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction); + /// Canonicalize array indices to pointer-size integers. This helps to + /// simplify the logic of splitting a GEP. For example, if a + b is a + /// pointer-size integer, we have + /// gep base, a + b = gep (gep base, a), b + /// However, this equality may not hold if the size of a + b is smaller than + /// the pointer size, because LLVM conceptually sign-extends GEP indices to + /// pointer size before computing the address + /// (http://llvm.org/docs/LangRef.html#id181). + /// + /// This canonicalization is very likely already done in clang and + /// instcombine. Therefore, the program will probably remain the same. + /// + /// Returns true if the module changes. 
+  ///
+  /// Verified in @i32_add in split-gep.ll
+  bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
+
+  const DataLayout *DL;
+};
+}  // anonymous namespace
+
+char SeparateConstOffsetFromGEP::ID = 0;
+INITIALIZE_PASS_BEGIN(
+    SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
+    "Split GEPs to a variadic base and a constant offset for better CSE", false,
+    false)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(DataLayoutPass)
+INITIALIZE_PASS_END(
+    SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
+    "Split GEPs to a variadic base and a constant offset for better CSE", false,
+    false)
+
+FunctionPass *llvm::createSeparateConstOffsetFromGEPPass() {
+  return new SeparateConstOffsetFromGEP();
+}
+
+bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
+                                           bool ZeroExtended,
+                                           BinaryOperator *BO,
+                                           bool NonNegative) {
+  // We only consider ADD, SUB and OR, because a non-zero constant found in
+  // expressions composed of these operations can be easily hoisted as a
+  // constant offset by reassociation.
+  if (BO->getOpcode() != Instruction::Add &&
+      BO->getOpcode() != Instruction::Sub &&
+      BO->getOpcode() != Instruction::Or) {
+    return false;
+  }
+
+  Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
+  // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS
+  // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS).
+  if (BO->getOpcode() == Instruction::Or && !NoCommonBits(LHS, RHS))
+    return false;
+
+  // In addition, tracing into BO requires that its surrounding s/zext (if
+  // any) is distributable to both operands.
+  //
+  // Suppose BO = A op B.
+  //  SignExtended | ZeroExtended | Distributable?
+  // --------------+--------------+----------------------------------
+  //       0       |      0       | true because no s/zext exists
+  //       0       |      1       | zext(BO) == zext(A) op zext(B)
+  //       1       |      0       | sext(BO) == sext(A) op sext(B)
+  //       1       |      1       | zext(sext(BO)) ==
+  //               |              |     zext(sext(A)) op zext(sext(B))
+  if (BO->getOpcode() == Instruction::Add && !ZeroExtended && NonNegative) {
+    // If a + b >= 0 and (a >= 0 or b >= 0), then
+    //   sext(a + b) = sext(a) + sext(b)
+    // even if the addition is not marked nsw.
+    //
+    // Leveraging this invariant, we can trace into an sext'ed inbounds GEP
+    // index if the constant offset is non-negative.
+    //
+    // Verified in @sext_add in split-gep.ll.
+    if (ConstantInt *ConstLHS = dyn_cast<ConstantInt>(LHS)) {
+      if (!ConstLHS->isNegative())
+        return true;
+    }
+    if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(RHS)) {
+      if (!ConstRHS->isNegative())
+        return true;
+    }
+  }
+
+  // sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B)
+  // zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B)
+  if (BO->getOpcode() == Instruction::Add ||
+      BO->getOpcode() == Instruction::Sub) {
+    if (SignExtended && !BO->hasNoSignedWrap())
+      return false;
+    if (ZeroExtended && !BO->hasNoUnsignedWrap())
+      return false;
+  }
+
+  return true;
+}
+
+APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO,
+                                                   bool SignExtended,
+                                                   bool ZeroExtended) {
+  // BO being non-negative does not shed light on whether its operands are
+  // non-negative. Clear the NonNegative flag here.
+  APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended,
+                              /* NonNegative */ false);
+  // If we found a constant offset in the left operand, stop and return that.
+  // This shortcut might cause us to miss opportunities to combine the
+  // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9.
+  // However, such cases are probably already handled by -instcombine,
+  // given this pass runs after the standard optimizations.
+  if (ConstantOffset != 0) return ConstantOffset;
+  ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended,
+                        /* NonNegative */ false);
+  // If BO is a sub operator, negate the constant offset found in the right
+  // operand.
+  if (BO->getOpcode() == Instruction::Sub)
+    ConstantOffset = -ConstantOffset;
+  return ConstantOffset;
+}
+
+APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
+                                    bool ZeroExtended, bool NonNegative) {
+  // TODO(jingyue): We could trace into integer/pointer casts, such as
+  // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only
+  // integers because it gives good enough results for our benchmarks.
+  unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
+
+  // We cannot do much with Values that are not a User, such as an Argument.
+  User *U = dyn_cast<User>(V);
+  if (U == nullptr) return APInt(BitWidth, 0);
+
+  APInt ConstantOffset(BitWidth, 0);
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+    // Hooray, we found it!
+    ConstantOffset = CI->getValue();
+  } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) {
+    // Trace into subexpressions for more hoisting opportunities.
+    if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative)) {
+      ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
+    }
+  } else if (isa<SExtInst>(V)) {
+    ConstantOffset = find(U->getOperand(0), /* SignExtended */ true,
+                          ZeroExtended, NonNegative).sext(BitWidth);
+  } else if (isa<ZExtInst>(V)) {
+    // As an optimization, we can clear the SignExtended flag because
+    // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll.
+    //
+    // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0.
+    ConstantOffset =
+        find(U->getOperand(0), /* SignExtended */ false,
+             /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth);
+  }
+
+  // If we found a non-zero constant offset, add it to the path for
+  // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't
+  // help this optimization.
+  if (ConstantOffset != 0)
+    UserChain.push_back(U);
+  return ConstantOffset;
+}
+
+Value *ConstantOffsetExtractor::applyExts(Value *V) {
+  Value *Current = V;
+  // ExtInsts is built in use-def order. Therefore, we apply them to V
+  // in reverse order.
+  for (auto I = ExtInsts.rbegin(), E = ExtInsts.rend(); I != E; ++I) {
+    if (Constant *C = dyn_cast<Constant>(Current)) {
+      // If Current is a constant, apply s/zext using ConstantExpr::getCast.
+      // ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt.
+      Current = ConstantExpr::getCast((*I)->getOpcode(), C, (*I)->getType());
+    } else {
+      Instruction *Ext = (*I)->clone();
+      Ext->setOperand(0, Current);
+      Ext->insertBefore(IP);
+      Current = Ext;
+    }
+  }
+  return Current;
+}
+
+Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
+  distributeExtsAndCloneChain(UserChain.size() - 1);
+  // Remove all nullptrs (used to be s/zext) from UserChain.
+  unsigned NewSize = 0;
+  for (auto I = UserChain.begin(), E = UserChain.end(); I != E; ++I) {
+    if (*I != nullptr) {
+      UserChain[NewSize] = *I;
+      NewSize++;
+    }
+  }
+  UserChain.resize(NewSize);
+  return removeConstOffset(UserChain.size() - 1);
+}
+
+Value *
+ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
+  User *U = UserChain[ChainIndex];
+  if (ChainIndex == 0) {
+    assert(isa<ConstantInt>(U));
+    // If U is a ConstantInt, applyExts will return a ConstantInt as well.
+    return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U));
+  }
+
+  if (CastInst *Cast = dyn_cast<CastInst>(U)) {
+    assert((isa<SExtInst>(Cast) || isa<ZExtInst>(Cast)) &&
+           "We only traced into two types of CastInst: sext and zext");
+    ExtInsts.push_back(Cast);
+    UserChain[ChainIndex] = nullptr;
+    return distributeExtsAndCloneChain(ChainIndex - 1);
+  }
+
+  // Function find only traces into BinaryOperator and CastInst.
+  BinaryOperator *BO = cast<BinaryOperator>(U);
+  // OpNo = which operand of BO is UserChain[ChainIndex - 1]
+  unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
+  Value *TheOther = applyExts(BO->getOperand(1 - OpNo));
+  Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1);
+
+  BinaryOperator *NewBO = nullptr;
+  if (OpNo == 0) {
+    NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther,
+                                   BO->getName(), IP);
+  } else {
+    NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain,
+                                   BO->getName(), IP);
+  }
+  return UserChain[ChainIndex] = NewBO;
+}
+
+Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
+  if (ChainIndex == 0) {
+    assert(isa<ConstantInt>(UserChain[ChainIndex]));
+    return ConstantInt::getNullValue(UserChain[ChainIndex]->getType());
+  }
+
+  BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]);
+  unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
+  assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]);
+  Value *NextInChain = removeConstOffset(ChainIndex - 1);
+  Value *TheOther = BO->getOperand(1 - OpNo);
+
+  // If NextInChain is 0 and not the LHS of a sub, we can simplify the
+  // sub-expression to be just TheOther.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
+    if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
+      return TheOther;
+  }
+
+  if (BO->getOpcode() == Instruction::Or) {
+    // Rebuild "or" as "add", because "or" may be invalid for the new
+    // expression.
+    //
+    // For instance, given
+    //   a | (b + 5) where a and b + 5 have no common bits,
+    // we can extract 5 as the constant offset.
+    //
+    // However, reusing the "or" in the new index would give us
+    //   (a | b) + 5
+    // which does not equal a | (b + 5).
+    //
+    // Replacing the "or" with "add" is fine, because
+    //   a | (b + 5) = a + (b + 5) = (a + b) + 5
+    return BinaryOperator::CreateAdd(BO->getOperand(0), BO->getOperand(1),
+                                     BO->getName(), IP);
+  }
+
+  // We can reuse BO in this case, because the new expression shares the same
+  // instruction type and BO is used at most once.
+  assert(BO->getNumUses() <= 1 &&
+         "distributeExtsAndCloneChain clones each BinaryOperator in "
+         "UserChain, so no one should be used more than "
+         "once");
+  BO->setOperand(OpNo, NextInChain);
+  BO->setHasNoSignedWrap(false);
+  BO->setHasNoUnsignedWrap(false);
+  // Make sure it appears after all instructions we've inserted so far.
+ BO->moveBefore(IP); + return BO; +} + +int64_t ConstantOffsetExtractor::Extract(Value *Idx, Value *&NewIdx, + const DataLayout *DL, + GetElementPtrInst *GEP) { + ConstantOffsetExtractor Extractor(DL, GEP); + // Find a non-zero constant offset first. + APInt ConstantOffset = + Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false, + GEP->isInBounds()); + if (ConstantOffset != 0) { + // Separates the constant offset from the GEP index. + NewIdx = Extractor.rebuildWithoutConstOffset(); + } + return ConstantOffset.getSExtValue(); +} + +int64_t ConstantOffsetExtractor::Find(Value *Idx, const DataLayout *DL, + GetElementPtrInst *GEP) { + // If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative. + return ConstantOffsetExtractor(DL, GEP) + .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false, + GEP->isInBounds()) + .getSExtValue(); +} + +void ConstantOffsetExtractor::ComputeKnownBits(Value *V, APInt &KnownOne, + APInt &KnownZero) const { + IntegerType *IT = cast<IntegerType>(V->getType()); + KnownOne = APInt(IT->getBitWidth(), 0); + KnownZero = APInt(IT->getBitWidth(), 0); + llvm::computeKnownBits(V, KnownZero, KnownOne, DL, 0); +} + +bool ConstantOffsetExtractor::NoCommonBits(Value *LHS, Value *RHS) const { + assert(LHS->getType() == RHS->getType() && + "LHS and RHS should have the same type"); + APInt LHSKnownOne, LHSKnownZero, RHSKnownOne, RHSKnownZero; + ComputeKnownBits(LHS, LHSKnownOne, LHSKnownZero); + ComputeKnownBits(RHS, RHSKnownOne, RHSKnownZero); + return (LHSKnownZero | RHSKnownZero).isAllOnesValue(); +} + +bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize( + GetElementPtrInst *GEP) { + bool Changed = false; + Type *IntPtrTy = DL->getIntPtrType(GEP->getType()); + gep_type_iterator GTI = gep_type_begin(*GEP); + for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end(); + I != E; ++I, ++GTI) { + // Skip struct member indices which must be i32. + if (isa<SequentialType>(*GTI)) { + if ((*I)->getType() != IntPtrTy) { + *I = CastInst::CreateIntegerCast(*I, IntPtrTy, true, "idxprom", GEP); + Changed = true; + } + } + } + return Changed; +} + +int64_t +SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP, + bool &NeedsExtraction) { + NeedsExtraction = false; + int64_t AccumulativeByteOffset = 0; + gep_type_iterator GTI = gep_type_begin(*GEP); + for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) { + if (isa<SequentialType>(*GTI)) { + // Tries to extract a constant offset from this GEP index. + int64_t ConstantOffset = + ConstantOffsetExtractor::Find(GEP->getOperand(I), DL, GEP); + if (ConstantOffset != 0) { + NeedsExtraction = true; + // A GEP may have multiple indices. We accumulate the extracted + // constant offset to a byte offset, and later offset the remainder of + // the original GEP with this byte offset. + AccumulativeByteOffset += + ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType()); + } + } + } + return AccumulativeByteOffset; +} + +bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { + // Skip vector GEPs. + if (GEP->getType()->isVectorTy()) + return false; + + // The backend can already nicely handle the case where all indices are + // constant. 
+  if (GEP->hasAllConstantIndices())
+    return false;
+
+  bool Changed = canonicalizeArrayIndicesToPointerSize(GEP);
+
+  bool NeedsExtraction;
+  int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
+
+  if (!NeedsExtraction)
+    return Changed;
+  // Before really splitting the GEP, check whether the backend supports the
+  // addressing mode we are about to produce. If not, this splitting probably
+  // won't be beneficial.
+  TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
+  if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(),
+                                 /*BaseGV=*/nullptr, AccumulativeByteOffset,
+                                 /*HasBaseReg=*/true, /*Scale=*/0)) {
+    return Changed;
+  }
+
+  // Remove the constant offset in each GEP index. The resultant GEP computes
+  // the variadic base.
+  gep_type_iterator GTI = gep_type_begin(*GEP);
+  for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+    if (isa<SequentialType>(*GTI)) {
+      Value *NewIdx = nullptr;
+      // Tries to extract a constant offset from this GEP index.
+      int64_t ConstantOffset =
+          ConstantOffsetExtractor::Extract(GEP->getOperand(I), NewIdx, DL, GEP);
+      if (ConstantOffset != 0) {
+        assert(NewIdx != nullptr &&
+               "ConstantOffset != 0 implies NewIdx is set");
+        GEP->setOperand(I, NewIdx);
+      }
+    }
+  }
+  // Clear the inbounds attribute because the new index may be off-bound.
+  // e.g.,
+  //
+  // b = add i64 a, 5
+  // addr = gep inbounds float* p, i64 b
+  //
+  // is transformed to:
+  //
+  // addr2 = gep float* p, i64 a
+  // addr = gep float* addr2, i64 5
+  //
+  // If a is -4, although the old index b is in bounds, the new index a is
+  // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
+  // inbounds keyword is not present, the offsets are added to the base
+  // address with silently-wrapping two's complement arithmetic".
+  // Therefore, the final code will be semantically equivalent.
+  //
+  // TODO(jingyue): do some range analysis to keep as many inbounds as
+  // possible. GEPs with inbounds are more friendly to alias analysis.
+  GEP->setIsInBounds(false);
+
+  // Offsets the base with the accumulative byte offset.
+  //
+  //   %gep                        ; the base
+  //   ... %gep ...
+  //
+  // => add the offset
+  //
+  //   %gep2                       ; clone of %gep
+  //   %new.gep = gep %gep2, <offset / sizeof(*%gep)>
+  //   %gep                        ; will be removed
+  //   ... %gep ...
+  //
+  // => replace all uses of %gep with %new.gep and remove %gep
+  //
+  //   %gep2                       ; clone of %gep
+  //   %new.gep = gep %gep2, <offset / sizeof(*%gep)>
+  //   ... %new.gep ...
+  //
+  // If AccumulativeByteOffset is not a multiple of sizeof(*%gep), we emit an
+  // uglygep (http://llvm.org/docs/GetElementPtr.html#what-s-an-uglygep):
+  // bitcast %gep2 to i8*, add the offset, and bitcast the result back to the
+  // type of %gep.
+  //
+  //   %gep2                       ; clone of %gep
+  //   %0 = bitcast %gep2 to i8*
+  //   %uglygep = gep %0, <offset>
+  //   %new.gep = bitcast %uglygep to <type of %gep>
+  //   ... %new.gep ...
+  Instruction *NewGEP = GEP->clone();
+  NewGEP->insertBefore(GEP);
+
+  uint64_t ElementTypeSizeOfGEP =
+      DL->getTypeAllocSize(GEP->getType()->getElementType());
+  Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+  if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
+    // Very likely. As long as %gep is naturally aligned, the byte offset we
+    // extracted should be a multiple of sizeof(*%gep).
+    // Per ANSI C standard, signed / unsigned = unsigned. Therefore, we
+    // cast ElementTypeSizeOfGEP to signed.
+    int64_t Index =
+        AccumulativeByteOffset / static_cast<int64_t>(ElementTypeSizeOfGEP);
+    NewGEP = GetElementPtrInst::Create(
+        NewGEP, ConstantInt::get(IntPtrTy, Index, true), GEP->getName(), GEP);
+  } else {
+    // Unlikely but possible. For example,
+    //  #pragma pack(1)
+    //  struct S {
+    //    int a[3];
+    //    int64 b[8];
+    //  };
+    //  #pragma pack()
+    //
+    // Suppose the gep before extraction is &s[i + 1].b[j + 3]. After
+    // extraction, it becomes &s[i].b[j] and AccumulativeByteOffset is
+    // sizeof(S) + 3 * sizeof(int64) = 100, which is not a multiple of
+    // sizeof(int64).
+    //
+    // Emit an uglygep in this case.
+    Type *I8PtrTy = Type::getInt8PtrTy(GEP->getContext(),
+                                       GEP->getPointerAddressSpace());
+    NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP);
+    NewGEP = GetElementPtrInst::Create(
+        NewGEP, ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true),
+        "uglygep", GEP);
+    if (GEP->getType() != I8PtrTy)
+      NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP);
+  }
+
+  GEP->replaceAllUsesWith(NewGEP);
+  GEP->eraseFromParent();
+
+  return true;
+}
+
+bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) {
+  if (DisableSeparateConstOffsetFromGEP)
+    return false;
+
+  bool Changed = false;
+  for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) {
+    for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ) {
+      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++)) {
+        Changed |= splitGEP(GEP);
+      }
+      // No need to split GEP ConstantExprs because all their indices are
+      // constant already.
+    }
+  }
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 8371f6d..5d5606b 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -21,23 +21,24 @@
//
//===----------------------------------------------------------------------===//

-#define DEBUG_TYPE "simplifycfg"
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

+#define DEBUG_TYPE "simplifycfg"
+
STATISTIC(NumSimpl, "Number of blocks simplified");

namespace {
@@ -46,9 +47,9 @@ struct CFGSimplifyPass : public FunctionPass {
  CFGSimplifyPass() : FunctionPass(ID) {
    initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
  }
-  virtual bool runOnFunction(Function &F);
+  bool runOnFunction(Function &F) override;

-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetTransformInfo>();
  }
};
@@ -71,7 +72,7 @@ FunctionPass *llvm::createCFGSimplificationPass() {
static bool mergeEmptyReturnBlocks(Function &F) {
  bool Changed = false;

-  BasicBlock *RetBlock = 0;
+  BasicBlock *RetBlock = nullptr;

  // Scan all the blocks in the function, looking for empty return blocks.
  for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; ) {
@@ -79,7 +80,7 @@ static bool mergeEmptyReturnBlocks(Function &F) {
    // Only look at return blocks.
ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator()); - if (Ret == 0) continue; + if (!Ret) continue; // Only look at the block if it is empty or the only other thing in it is a // single PHI node that is the operand to the return. @@ -98,7 +99,7 @@ static bool mergeEmptyReturnBlocks(Function &F) { } // If this is the first returning block, remember it and keep going. - if (RetBlock == 0) { + if (!RetBlock) { RetBlock = &BB; continue; } @@ -119,7 +120,7 @@ static bool mergeEmptyReturnBlocks(Function &F) { // If the canonical return block has no PHI node, create one now. PHINode *RetBlockPHI = dyn_cast<PHINode>(RetBlock->begin()); - if (RetBlockPHI == 0) { + if (!RetBlockPHI) { Value *InVal = cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0); pred_iterator PB = pred_begin(RetBlock), PE = pred_end(RetBlock); RetBlockPHI = PHINode::Create(Ret->getOperand(0)->getType(), @@ -145,7 +146,7 @@ static bool mergeEmptyReturnBlocks(Function &F) { /// iterativelySimplifyCFG - Call SimplifyCFG on all the blocks in the function, /// iterating until no more changes are made. static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, - const DataLayout *TD) { + const DataLayout *DL) { bool Changed = false; bool LocalChange = true; while (LocalChange) { @@ -154,7 +155,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, // Loop over all of the basic blocks and remove them if they are unneeded... // for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { - if (SimplifyCFG(BBIt++, TTI, TD)) { + if (SimplifyCFG(BBIt++, TTI, DL)) { LocalChange = true; ++NumSimpl; } @@ -168,11 +169,15 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, // simplify the CFG. // bool CFGSimplifyPass::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; + const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>(); - const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; bool EverChanged = removeUnreachableBlocks(F); EverChanged |= mergeEmptyReturnBlocks(F); - EverChanged |= iterativelySimplifyCFG(F, TTI, TD); + EverChanged |= iterativelySimplifyCFG(F, TTI, DL); // If neither pass changed anything, we're done. 
if (!EverChanged) return false; @@ -186,7 +191,7 @@ bool CFGSimplifyPass::runOnFunction(Function &F) { return true; do { - EverChanged = iterativelySimplifyCFG(F, TTI, TD); + EverChanged = iterativelySimplifyCFG(F, TTI, DL); EverChanged |= removeUnreachableBlocks(F); } while (EverChanged); diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp index d4595bb..7348c45c 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp @@ -12,20 +12,21 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "sink" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Assembly/Writer.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "sink" + STATISTIC(NumSunk, "Number of instructions sunk"); STATISTIC(NumSinkIter, "Number of sinking iterations"); @@ -34,6 +35,7 @@ namespace { DominatorTree *DT; LoopInfo *LI; AliasAnalysis *AA; + const DataLayout *DL; public: static char ID; // Pass identification @@ -41,15 +43,15 @@ namespace { initializeSinkingPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); FunctionPass::getAnalysisUsage(AU); AU.addRequired<AliasAnalysis>(); - AU.addRequired<DominatorTree>(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfo>(); - AU.addPreserved<DominatorTree>(); + AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<LoopInfo>(); } private: @@ -63,7 +65,7 @@ namespace { char Sinking::ID = 0; INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfo) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(Sinking, "sink", "Code sinking", false, false) @@ -77,15 +79,14 @@ bool Sinking::AllUsesDominatedByBlock(Instruction *Inst, // This may leave a referencing dbg_value in the original block, before // the definition of the vreg. Dwarf generator handles this although the // user might not get the right info at runtime. - for (Value::use_iterator I = Inst->use_begin(), - E = Inst->use_end(); I != E; ++I) { + for (Use &U : Inst->uses()) { // Determine the block of the use. - Instruction *UseInst = cast<Instruction>(*I); + Instruction *UseInst = cast<Instruction>(U.getUser()); BasicBlock *UseBlock = UseInst->getParent(); if (PHINode *PN = dyn_cast<PHINode>(UseInst)) { // PHI nodes use the operand in the predecessor block, not the block with // the PHI. - unsigned Num = PHINode::getIncomingValueNumForOperand(I.getOperandNo()); + unsigned Num = PHINode::getIncomingValueNumForOperand(U.getOperandNo()); UseBlock = PN->getIncomingBlock(Num); } // Check that it dominates. 
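[Illustrative aside, not part of the patch: a recurring mechanical change in this commit, visible in the AllUsesDominatedByBlock hunk above, is replacing explicit use_iterator loops with range-based iteration over uses(). A minimal sketch of the new idiom, assuming LLVM headers of this vintage; the helper name allUsesIn is made up, and the membership check stands in for the dominance check the pass actually performs.]

#include "llvm/IR/Instructions.h"
using namespace llvm;

// Sketch: walk every Use of an instruction; for PHI uses, the use "lives"
// in the incoming block, not in the block containing the PHI.
static bool allUsesIn(Instruction *Inst, BasicBlock *BB) {
  for (Use &U : Inst->uses()) {
    Instruction *UseInst = cast<Instruction>(U.getUser());
    BasicBlock *UseBlock = UseInst->getParent();
    if (PHINode *PN = dyn_cast<PHINode>(UseInst)) {
      unsigned Num = PHINode::getIncomingValueNumForOperand(U.getOperandNo());
      UseBlock = PN->getIncomingBlock(Num);
    }
    if (UseBlock != BB)
      return false;
  }
  return true;
}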
@@ -96,9 +97,11 @@ bool Sinking::AllUsesDominatedByBlock(Instruction *Inst, } bool Sinking::runOnFunction(Function &F) { - DT = &getAnalysis<DominatorTree>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfo>(); AA = &getAnalysis<AliasAnalysis>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; bool MadeChange, EverMadeChange = false; @@ -194,7 +197,7 @@ bool Sinking::IsAcceptableTarget(Instruction *Inst, if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) { // We cannot sink a load across a critical edge - there may be stores in // other code paths. - if (!isSafeToSpeculativelyExecute(Inst)) + if (!isSafeToSpeculativelyExecute(Inst, DL)) return false; // We don't want to sink across a critical edge if we don't dominate the @@ -205,7 +208,7 @@ bool Sinking::IsAcceptableTarget(Instruction *Inst, // Don't sink instructions into a loop. Loop *succ = LI->getLoopFor(SuccToSinkTo); Loop *cur = LI->getLoopFor(Inst->getParent()); - if (succ != 0 && succ != cur) + if (succ != nullptr && succ != cur) return false; } @@ -218,6 +221,13 @@ bool Sinking::IsAcceptableTarget(Instruction *Inst, /// instruction out of its current block into a successor. bool Sinking::SinkInstruction(Instruction *Inst, SmallPtrSet<Instruction *, 8> &Stores) { + + // Don't sink static alloca instructions. CodeGen assumes allocas outside the + // entry block are dynamically sized stack objects. + if (AllocaInst *AI = dyn_cast<AllocaInst>(Inst)) + if (AI->isStaticAlloca()) + return false; + // Check if it's safe to move the instruction. if (!isSafeToMove(Inst, AA, Stores)) return false; @@ -232,14 +242,14 @@ bool Sinking::SinkInstruction(Instruction *Inst, // SuccToSinkTo - This is the successor to sink this instruction to, once we // decide. - BasicBlock *SuccToSinkTo = 0; + BasicBlock *SuccToSinkTo = nullptr; // Instructions can only be sunk if all their uses are in blocks // dominated by one of the successors. // Look at all the postdominators and see if we can sink it in one. DomTreeNode *DTN = DT->getNode(Inst->getParent()); for (DomTreeNode::iterator I = DTN->begin(), E = DTN->end(); - I != E && SuccToSinkTo == 0; ++I) { + I != E && SuccToSinkTo == nullptr; ++I) { BasicBlock *Candidate = (*I)->getBlock(); if ((*I)->getIDom()->getBlock() == Inst->getParent() && IsAcceptableTarget(Inst, Candidate)) @@ -249,19 +259,19 @@ bool Sinking::SinkInstruction(Instruction *Inst, // If no suitable postdominator was found, look at all the successors and // decide which one we should sink to, if any. for (succ_iterator I = succ_begin(Inst->getParent()), - E = succ_end(Inst->getParent()); I != E && SuccToSinkTo == 0; ++I) { + E = succ_end(Inst->getParent()); I != E && !SuccToSinkTo; ++I) { if (IsAcceptableTarget(Inst, *I)) SuccToSinkTo = *I; } // If we couldn't find a block to sink to, ignore this instruction. - if (SuccToSinkTo == 0) + if (!SuccToSinkTo) return false; DEBUG(dbgs() << "Sink" << *Inst << " ("; - WriteAsOperand(dbgs(), Inst->getParent(), false); + Inst->getParent()->printAsOperand(dbgs(), false); dbgs() << " -> "; - WriteAsOperand(dbgs(), SuccToSinkTo, false); + SuccToSinkTo->printAsOperand(dbgs(), false); dbgs() << ")\n"); // Move the instruction. 
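[Illustrative aside, not part of the patch: the SinkInstruction hunk above adds an early bail-out for static allocas, because sinking one out of the entry block would turn it into what CodeGen treats as a dynamically sized stack object. A minimal sketch of that guard; the helper name is hypothetical.]

#include "llvm/IR/Instructions.h"
using namespace llvm;

// Sketch: never consider a static alloca for sinking. isStaticAlloca()
// means a constant array size and placement in the entry block.
static bool mayBeSunk(Instruction *Inst) {
  if (AllocaInst *AI = dyn_cast<AllocaInst>(Inst))
    if (AI->isStaticAlloca())
      return false;
  return true;  // the pass's other checks (aliasing, dominance) still apply
}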
diff --git a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 5045ff8f..b9673ed 100644 --- a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -7,7 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "structurizecfg" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SCCIterator.h" @@ -15,12 +14,14 @@ #include "llvm/Analysis/RegionIterator.h" #include "llvm/Analysis/RegionPass.h" #include "llvm/IR/Module.h" -#include "llvm/Support/PatternMatch.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; using namespace llvm::PatternMatch; +#define DEBUG_TYPE "structurizecfg" + namespace { // Definition of the complex types used in this pass. @@ -64,14 +65,14 @@ public: /// \brief Start a new query NearestCommonDominator(DominatorTree *DomTree) { DT = DomTree; - Result = 0; + Result = nullptr; } /// \brief Add BB to the resulting dominator void addBlock(BasicBlock *BB, bool Remember = true) { DomTreeNode *Node = DT->getNode(BB); - if (Result == 0) { + if (!Result) { unsigned Numbering = 0; for (;Node;Node = Node->getIDom()) IndexMap[Node] = ++Numbering; @@ -235,18 +236,18 @@ public: } using Pass::doInitialization; - virtual bool doInitialization(Region *R, RGPassManager &RGM); + bool doInitialization(Region *R, RGPassManager &RGM) override; - virtual bool runOnRegion(Region *R, RGPassManager &RGM); + bool runOnRegion(Region *R, RGPassManager &RGM) override; - virtual const char *getPassName() const { + const char *getPassName() const override { return "Structurize control flow"; } - void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequiredID(LowerSwitchID); - AU.addRequired<DominatorTree>(); - AU.addPreserved<DominatorTree>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); RegionPass::getAnalysisUsage(AU); } }; @@ -258,8 +259,8 @@ char StructurizeCFG::ID = 0; INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG", false, false) INITIALIZE_PASS_DEPENDENCY(LowerSwitch) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) -INITIALIZE_PASS_DEPENDENCY(RegionInfo) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(RegionInfoPass) INITIALIZE_PASS_END(StructurizeCFG, "structurizecfg", "Structurize the CFG", false, false) @@ -277,10 +278,9 @@ bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) { /// \brief Build up the general order of nodes void StructurizeCFG::orderNodes() { - scc_iterator<Region *> I = scc_begin(ParentRegion), - E = scc_end(ParentRegion); - for (Order.clear(); I != E; ++I) { - std::vector<RegionNode *> &Nodes = *I; + scc_iterator<Region *> I = scc_begin(ParentRegion); + for (Order.clear(); !I.isAtEnd(); ++I) { + const std::vector<RegionNode *> &Nodes = *I; Order.append(Nodes.begin(), Nodes.end()); } } @@ -326,16 +326,10 @@ Value *StructurizeCFG::invert(Value *Condition) { if (Instruction *Inst = dyn_cast<Instruction>(Condition)) { // Third: Check all the users for an invert BasicBlock *Parent = Inst->getParent(); - for (Value::use_iterator I = Condition->use_begin(), - E = Condition->use_end(); I != E; ++I) { - - Instruction *User = dyn_cast<Instruction>(*I); - if (!User || User->getParent() != Parent) - continue; - - if (match(*I, 
m_Not(m_Specific(Condition)))) - return *I; - } + for (User *U : Condition->users()) + if (Instruction *I = dyn_cast<Instruction>(U)) + if (I->getParent() == Parent && match(I, m_Not(m_Specific(Condition)))) + return I; // Last option: Create a new instruction return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator()); @@ -412,11 +406,11 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) { } else { // It's an exit from a sub region - while(R->getParent() != ParentRegion) + while (R->getParent() != ParentRegion) R = R->getParent(); // Edge from inside a subregion to its entry, ignore it - if (R == N) + if (*R == *N) continue; BasicBlock *Entry = R->getEntry(); @@ -460,10 +454,7 @@ void StructurizeCFG::insertConditions(bool Loops) { Value *Default = Loops ? BoolTrue : BoolFalse; SSAUpdater PhiInserter; - for (BranchVector::iterator I = Conds.begin(), - E = Conds.end(); I != E; ++I) { - - BranchInst *Term = *I; + for (BranchInst *Term : Conds) { assert(Term->isConditional()); BasicBlock *Parent = Term->getParent(); @@ -479,7 +470,7 @@ void StructurizeCFG::insertConditions(bool Loops) { NearestCommonDominator Dominator(DT); Dominator.addBlock(Parent, false); - Value *ParentValue = 0; + Value *ParentValue = nullptr; for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); PI != PE; ++PI) { @@ -598,7 +589,7 @@ void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit, if (Node->isSubRegion()) { Region *SubRegion = Node->getNodeAs<Region>(); BasicBlock *OldExit = SubRegion->getExit(); - BasicBlock *Dominator = 0; + BasicBlock *Dominator = nullptr; // Find all the edges from the sub region to the exit for (pred_iterator I = pred_begin(OldExit), E = pred_end(OldExit); @@ -685,7 +676,8 @@ BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow, /// \brief Set the previous node void StructurizeCFG::setPrevNode(BasicBlock *BB) { - PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB) : 0; + PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB) + : nullptr; } /// \brief Does BB dominate all the predicates of Node ? @@ -706,7 +698,7 @@ bool StructurizeCFG::isPredictableTrue(RegionNode *Node) { bool Dominated = false; // Regionentry is always true - if (PrevNode == 0) + if (!PrevNode) return true; for (BBPredicates::iterator I = Preds.begin(), E = Preds.end(); @@ -813,11 +805,11 @@ void StructurizeCFG::createFlow() { Conditions.clear(); LoopConds.clear(); - PrevNode = 0; + PrevNode = nullptr; Visited.clear(); while (!Order.empty()) { - handleLoops(EntryDominatesExit, 0); + handleLoops(EntryDominatesExit, nullptr); } if (PrevNode) @@ -830,25 +822,19 @@ void StructurizeCFG::createFlow() { /// no longer dominate all their uses. 
Not sure if this is really nessasary void StructurizeCFG::rebuildSSA() { SSAUpdater Updater; - for (Region::block_iterator I = ParentRegion->block_begin(), - E = ParentRegion->block_end(); - I != E; ++I) { - - BasicBlock *BB = *I; + for (const auto &BB : ParentRegion->blocks()) for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE; ++II) { bool Initialized = false; - for (Use *I = &II->use_begin().getUse(), *Next; I; I = Next) { - - Next = I->getNext(); - - Instruction *User = cast<Instruction>(I->getUser()); + for (auto I = II->use_begin(), E = II->use_end(); I != E;) { + Use &U = *I++; + Instruction *User = cast<Instruction>(U.getUser()); if (User->getParent() == BB) { continue; } else if (PHINode *UserPN = dyn_cast<PHINode>(User)) { - if (UserPN->getIncomingBlock(*I) == BB) + if (UserPN->getIncomingBlock(U) == BB) continue; } @@ -862,10 +848,9 @@ void StructurizeCFG::rebuildSSA() { Updater.AddAvailableValue(BB, II); Initialized = true; } - Updater.RewriteUseAfterInsertions(*I); + Updater.RewriteUseAfterInsertions(U); } } - } } /// \brief Run the transformation for each region found @@ -876,7 +861,7 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) { Func = R->getEntry()->getParent(); ParentRegion = R; - DT = &getAnalysis<DominatorTree>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); orderNodes(); collectInfos(); diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index 9fb8ddc..b758025 100644 --- a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -50,32 +50,35 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "tailcallelim" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" -#include "llvm/Support/CFG.h" -#include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "tailcallelim" + STATISTIC(NumEliminated, "Number of tail calls removed"); STATISTIC(NumRetDuped, "Number of return duplicated"); STATISTIC(NumAccumAdded, "Number of accumulators introduced"); @@ -89,11 +92,14 @@ namespace { initializeTailCallElimPass(*PassRegistry::getPassRegistry()); } - virtual void getAnalysisUsage(AnalysisUsage &AU) const; + void getAnalysisUsage(AnalysisUsage &AU) const override; - virtual bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; private: + bool runTRE(Function &F); + bool markTails(Function &F, bool &AllCallsAreTailCalls); + CallInst *FindTRECandidate(Instruction *I, bool CannotTailCallElimCallsMarkedTail); bool 
EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, @@ -131,52 +137,253 @@ void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<TargetTransformInfo>(); } -/// CanTRE - Scan the specified basic block for alloca instructions. -/// If it contains any that are variable-sized or not in the entry block, -/// returns false. -static bool CanTRE(AllocaInst *AI) { - // Because of PR962, we don't TRE allocas outside the entry block. - - // If this alloca is in the body of the function, or if it is a variable - // sized allocation, we cannot tail call eliminate calls marked 'tail' - // with this mechanism. - BasicBlock *BB = AI->getParent(); - return BB == &BB->getParent()->getEntryBlock() && - isa<ConstantInt>(AI->getArraySize()); +/// \brief Scan the specified function for alloca instructions. +/// If it contains any dynamic allocas, returns false. +static bool CanTRE(Function &F) { + // Because of PR962, we don't TRE dynamic allocas. + for (auto &BB : F) { + for (auto &I : BB) { + if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) { + if (!AI->isStaticAlloca()) + return false; + } + } + } + + return true; } -namespace { -struct AllocaCaptureTracker : public CaptureTracker { - AllocaCaptureTracker() : Captured(false) {} +bool TailCallElim::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; - void tooManyUses() LLVM_OVERRIDE { Captured = true; } + bool AllCallsAreTailCalls = false; + bool Modified = markTails(F, AllCallsAreTailCalls); + if (AllCallsAreTailCalls) + Modified |= runTRE(F); + return Modified; +} - bool shouldExplore(Use *U) LLVM_OVERRIDE { - Value *V = U->getUser(); - if (isa<CallInst>(V) || isa<InvokeInst>(V)) - UsesAlloca.insert(V); - return true; +namespace { +struct AllocaDerivedValueTracker { + // Start at a root value and walk its use-def chain to mark calls that use the + // value or a derived value in AllocaUsers, and places where it may escape in + // EscapePoints. + void walk(Value *Root) { + SmallVector<Use *, 32> Worklist; + SmallPtrSet<Use *, 32> Visited; + + auto AddUsesToWorklist = [&](Value *V) { + for (auto &U : V->uses()) { + if (!Visited.insert(&U)) + continue; + Worklist.push_back(&U); + } + }; + + AddUsesToWorklist(Root); + + while (!Worklist.empty()) { + Use *U = Worklist.pop_back_val(); + Instruction *I = cast<Instruction>(U->getUser()); + + switch (I->getOpcode()) { + case Instruction::Call: + case Instruction::Invoke: { + CallSite CS(I); + bool IsNocapture = !CS.isCallee(U) && + CS.doesNotCapture(CS.getArgumentNo(U)); + callUsesLocalStack(CS, IsNocapture); + if (IsNocapture) { + // If the alloca-derived argument is passed in as nocapture, then it + // can't propagate to the call's return. That would be capturing. + continue; + } + break; + } + case Instruction::Load: { + // The result of a load is not alloca-derived (unless an alloca has + // otherwise escaped, but this is a local analysis). + continue; + } + case Instruction::Store: { + if (U->getOperandNo() == 0) + EscapePoints.insert(I); + continue; // Stores have no users to analyze. + } + case Instruction::BitCast: + case Instruction::GetElementPtr: + case Instruction::PHI: + case Instruction::Select: + case Instruction::AddrSpaceCast: + break; + default: + EscapePoints.insert(I); + break; + } + + AddUsesToWorklist(I); + } } - bool captured(Use *U) LLVM_OVERRIDE { - if (isa<ReturnInst>(U->getUser())) - return false; - Captured = true; - return true; + void callUsesLocalStack(CallSite CS, bool IsNocapture) { + // Add it to the list of alloca users. 
+ AllocaUsers.insert(CS.getInstruction()); + + // If it's nocapture then it can't capture this alloca. + if (IsNocapture) + return; + + // If it can write to memory, it can leak the alloca value. + if (!CS.onlyReadsMemory()) + EscapePoints.insert(CS.getInstruction()); } - bool Captured; - SmallPtrSet<const Value *, 16> UsesAlloca; + SmallPtrSet<Instruction *, 32> AllocaUsers; + SmallPtrSet<Instruction *, 32> EscapePoints; }; -} // end anonymous namespace +} -bool TailCallElim::runOnFunction(Function &F) { +bool TailCallElim::markTails(Function &F, bool &AllCallsAreTailCalls) { + if (F.callsFunctionThatReturnsTwice()) + return false; + AllCallsAreTailCalls = true; + + // The local stack holds all alloca instructions and all byval arguments. + AllocaDerivedValueTracker Tracker; + for (Argument &Arg : F.args()) { + if (Arg.hasByValAttr()) + Tracker.walk(&Arg); + } + for (auto &BB : F) { + for (auto &I : BB) + if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) + Tracker.walk(AI); + } + + bool Modified = false; + + // Track whether a block is reachable after an alloca has escaped. Blocks that + // contain the escaping instruction will be marked as being visited without an + // escaped alloca, since that is how the block began. + enum VisitType { + UNVISITED, + UNESCAPED, + ESCAPED + }; + DenseMap<BasicBlock *, VisitType> Visited; + + // We propagate the fact that an alloca has escaped from block to successor. + // Visit the blocks that are propagating the escapedness first. To do this, we + // maintain two worklists. + SmallVector<BasicBlock *, 32> WorklistUnescaped, WorklistEscaped; + + // We may enter a block and visit it thinking that no alloca has escaped yet, + // then see an escape point and go back around a loop edge and come back to + // the same block twice. Because of this, we defer setting tail on calls when + // we first encounter them in a block. Every entry in this list does not + // statically use an alloca via use-def chain analysis, but may find an alloca + // through other means if the block turns out to be reachable after an escape + // point. + SmallVector<CallInst *, 32> DeferredTails; + + BasicBlock *BB = &F.getEntryBlock(); + VisitType Escaped = UNESCAPED; + do { + for (auto &I : *BB) { + if (Tracker.EscapePoints.count(&I)) + Escaped = ESCAPED; + + CallInst *CI = dyn_cast<CallInst>(&I); + if (!CI || CI->isTailCall()) + continue; + + if (CI->doesNotAccessMemory()) { + // A call to a readnone function whose arguments are all things computed + // outside this function can be marked tail. Even if you stored the + // alloca address into a global, a readnone function can't load the + // global anyhow. + // + // Note that this runs whether we know an alloca has escaped or not. If + // it has, then we can't trust Tracker.AllocaUsers to be accurate. 
+ bool SafeToTail = true; + for (auto &Arg : CI->arg_operands()) { + if (isa<Constant>(Arg.getUser())) + continue; + if (Argument *A = dyn_cast<Argument>(Arg.getUser())) + if (!A->hasByValAttr()) + continue; + SafeToTail = false; + break; + } + if (SafeToTail) { + emitOptimizationRemark( + F.getContext(), "tailcallelim", F, CI->getDebugLoc(), + "marked this readnone call a tail call candidate"); + CI->setTailCall(); + Modified = true; + continue; + } + } + + if (Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) { + DeferredTails.push_back(CI); + } else { + AllCallsAreTailCalls = false; + } + } + + for (auto *SuccBB : make_range(succ_begin(BB), succ_end(BB))) { + auto &State = Visited[SuccBB]; + if (State < Escaped) { + State = Escaped; + if (State == ESCAPED) + WorklistEscaped.push_back(SuccBB); + else + WorklistUnescaped.push_back(SuccBB); + } + } + + if (!WorklistEscaped.empty()) { + BB = WorklistEscaped.pop_back_val(); + Escaped = ESCAPED; + } else { + BB = nullptr; + while (!WorklistUnescaped.empty()) { + auto *NextBB = WorklistUnescaped.pop_back_val(); + if (Visited[NextBB] == UNESCAPED) { + BB = NextBB; + Escaped = UNESCAPED; + break; + } + } + } + } while (BB); + + for (CallInst *CI : DeferredTails) { + if (Visited[CI->getParent()] != ESCAPED) { + // If the escape point was part way through the block, calls after the + // escape point wouldn't have been put into DeferredTails. + emitOptimizationRemark(F.getContext(), "tailcallelim", F, + CI->getDebugLoc(), + "marked this call a tail call candidate"); + CI->setTailCall(); + Modified = true; + } else { + AllCallsAreTailCalls = false; + } + } + + return Modified; +} + +bool TailCallElim::runTRE(Function &F) { // If this function is a varargs function, we won't be able to PHI the args // right, so don't even try to convert it... if (F.getFunctionType()->isVarArg()) return false; TTI = &getAnalysis<TargetTransformInfo>(); - BasicBlock *OldEntry = 0; + BasicBlock *OldEntry = nullptr; bool TailCallsAreMarkedTail = false; SmallVector<PHINode*, 8> ArgumentPHIs; bool MadeChange = false; @@ -185,39 +392,23 @@ bool TailCallElim::runOnFunction(Function &F) { // marked with the 'tail' attribute, because doing so would cause the stack // size to increase (real TRE would deallocate variable sized allocas, TRE // doesn't). - bool CanTRETailMarkedCall = true; - - // Find calls that can be marked tail. - AllocaCaptureTracker ACT; - for (Function::iterator BB = F.begin(), EE = F.end(); BB != EE; ++BB) { - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { - if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { - CanTRETailMarkedCall &= CanTRE(AI); - PointerMayBeCaptured(AI, &ACT); - // If any allocas are captured, exit. - if (ACT.Captured) - return false; - } - } - } + bool CanTRETailMarkedCall = CanTRE(F); - // Second pass, change any tail recursive calls to loops. + // Change any tail recursive calls to loops. // // FIXME: The code generator produces really bad code when an 'escaping // alloca' is changed from being a static alloca to being a dynamic alloca. // Until this is resolved, disable this transformation if that would ever // happen. This bug is PR962. 
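[Illustrative aside, not part of the patch: for orientation, this is the end goal of runTRE below, sketched at the C++ level rather than in IR. The pass itself rewrites the recursive call into a branch back to the entry block, with PHI nodes playing the role of the loop variables; the function names here are made up.]

// Before: the recursive call is in tail position once the accumulator is
// threaded through, so it is eligible for elimination.
static int fact(int n, int acc) {
  if (n <= 1)
    return acc;
  return fact(n - 1, acc * n);  // tail call
}

// After, conceptually: the call becomes a jump back to the top.
static int factLoop(int n, int acc) {
  while (n > 1) {
    acc *= n;
    --n;
  }
  return acc;
}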
- if (ACT.UsesAlloca.empty()) { - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { - bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, - ArgumentPHIs, !CanTRETailMarkedCall); - if (!Change && BB->getFirstNonPHIOrDbg() == Ret) - Change = FoldReturnAndProcessPred(BB, Ret, OldEntry, - TailCallsAreMarkedTail, ArgumentPHIs, - !CanTRETailMarkedCall); - MadeChange |= Change; - } + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { + bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, + ArgumentPHIs, !CanTRETailMarkedCall); + if (!Change && BB->getFirstNonPHIOrDbg() == Ret) + Change = FoldReturnAndProcessPred(BB, Ret, OldEntry, + TailCallsAreMarkedTail, ArgumentPHIs, + !CanTRETailMarkedCall); + MadeChange |= Change; } } @@ -226,34 +417,13 @@ bool TailCallElim::runOnFunction(Function &F) { // with themselves. Check to see if we did and clean up our mess if so. This // occurs when a function passes an argument straight through to its tail // call. - if (!ArgumentPHIs.empty()) { - for (unsigned i = 0, e = ArgumentPHIs.size(); i != e; ++i) { - PHINode *PN = ArgumentPHIs[i]; - - // If the PHI Node is a dynamic constant, replace it with the value it is. - if (Value *PNV = SimplifyInstruction(PN)) { - PN->replaceAllUsesWith(PNV); - PN->eraseFromParent(); - } - } - } + for (unsigned i = 0, e = ArgumentPHIs.size(); i != e; ++i) { + PHINode *PN = ArgumentPHIs[i]; - // At this point, we know that the function does not have any captured - // allocas. If additionally the function does not call setjmp, mark all calls - // in the function that do not access stack memory with the tail keyword. This - // implies ensuring that there does not exist any path from a call that takes - // in an alloca but does not capture it and the call which we wish to mark - // with "tail". - if (!F.callsFunctionThatReturnsTwice()) { - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { - if (CallInst *CI = dyn_cast<CallInst>(I)) { - if (!ACT.UsesAlloca.count(CI)) { - CI->setTailCall(); - MadeChange = true; - } - } - } + // If the PHI Node is a dynamic constant, replace it with the value it is. + if (Value *PNV = SimplifyInstruction(PN)) { + PN->replaceAllUsesWith(PNV); + PN->eraseFromParent(); } } @@ -340,11 +510,11 @@ static bool isDynamicConstant(Value *V, CallInst *CI, ReturnInst *RI) { // static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) { Function *F = CI->getParent()->getParent(); - Value *ReturnedValue = 0; + Value *ReturnedValue = nullptr; for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI) { ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator()); - if (RI == 0 || RI == IgnoreRI) continue; + if (RI == nullptr || RI == IgnoreRI) continue; // We can only perform this transformation if the value returned is // evaluatable at the start of the initial invocation of the function, @@ -352,10 +522,10 @@ static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) { // Value *RetOp = RI->getOperand(0); if (!isDynamicConstant(RetOp, CI, RI)) - return 0; + return nullptr; if (ReturnedValue && RetOp != ReturnedValue) - return 0; // Cannot transform if differing values are returned. + return nullptr; // Cannot transform if differing values are returned. 
ReturnedValue = RetOp; } return ReturnedValue; @@ -367,23 +537,23 @@ static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) { /// Value *TailCallElim::CanTransformAccumulatorRecursion(Instruction *I, CallInst *CI) { - if (!I->isAssociative() || !I->isCommutative()) return 0; + if (!I->isAssociative() || !I->isCommutative()) return nullptr; assert(I->getNumOperands() == 2 && "Associative/commutative operations should have 2 args!"); // Exactly one operand should be the result of the call instruction. if ((I->getOperand(0) == CI && I->getOperand(1) == CI) || (I->getOperand(0) != CI && I->getOperand(1) != CI)) - return 0; + return nullptr; // The only user of this instruction we allow is a single return instruction. - if (!I->hasOneUse() || !isa<ReturnInst>(I->use_back())) - return 0; + if (!I->hasOneUse() || !isa<ReturnInst>(I->user_back())) + return nullptr; // Ok, now we have to check all of the other return instructions in this // function. If they return non-constants or differing values, then we cannot // transform the function safely. - return getCommonReturnValue(cast<ReturnInst>(I->use_back()), CI); + return getCommonReturnValue(cast<ReturnInst>(I->user_back()), CI); } static Instruction *FirstNonDbg(BasicBlock::iterator I) { @@ -399,11 +569,11 @@ TailCallElim::FindTRECandidate(Instruction *TI, Function *F = BB->getParent(); if (&BB->front() == TI) // Make sure there is something before the terminator. - return 0; + return nullptr; // Scan backwards from the return, checking to see if there is a tail call in // this block. If so, set CI to it. - CallInst *CI = 0; + CallInst *CI = nullptr; BasicBlock::iterator BBI = TI; while (true) { CI = dyn_cast<CallInst>(BBI); @@ -411,14 +581,14 @@ TailCallElim::FindTRECandidate(Instruction *TI, break; if (BBI == BB->begin()) - return 0; // Didn't find a potential tail call. + return nullptr; // Didn't find a potential tail call. --BBI; } // If this call is marked as a tail call, and if there are dynamic allocas in // the function, we cannot perform this optimization. if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail) - return 0; + return nullptr; // As a special case, detect code like this: // double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call @@ -426,7 +596,7 @@ TailCallElim::FindTRECandidate(Instruction *TI, // lower the call to fabs into inline code. if (BB == &F->getEntryBlock() && FirstNonDbg(BB->front()) == CI && - FirstNonDbg(llvm::next(BB->begin())) == TI && + FirstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() && !TTI->isLoweredToCall(CI->getCalledFunction())) { // A single-block function with just a call and a return. Check that @@ -438,7 +608,7 @@ TailCallElim::FindTRECandidate(Instruction *TI, for (; I != E && FI != FE; ++I, ++FI) if (*I != &*FI) break; if (I == E && FI == FE) - return 0; + return nullptr; } return CI; @@ -459,8 +629,8 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, // which is different to the constant returned by other return instructions // (which is recorded in AccumulatorRecursionEliminationInitVal). This is a // special case of accumulator recursion, the operation being "return C". - Value *AccumulatorRecursionEliminationInitVal = 0; - Instruction *AccumulatorRecursionInstr = 0; + Value *AccumulatorRecursionEliminationInitVal = nullptr; + Instruction *AccumulatorRecursionInstr = nullptr; // Ok, we found a potential tail call. 
We can currently only transform the
  // tail call if all of the instructions between the call and the return are
@@ -490,8 +660,8 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
   // accumulator recursion variable eliminated.
   if (Ret->getNumOperands() == 1 && Ret->getReturnValue() != CI &&
       !isa<UndefValue>(Ret->getReturnValue()) &&
-      AccumulatorRecursionEliminationInitVal == 0 &&
-      !getCommonReturnValue(0, CI)) {
+      AccumulatorRecursionEliminationInitVal == nullptr &&
+      !getCommonReturnValue(nullptr, CI)) {
     // One case remains that we are able to handle: the current return
     // instruction returns a constant, and all other return instructions
     // return a different constant.
@@ -507,9 +677,12 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
   BasicBlock *BB = Ret->getParent();
   Function *F = BB->getParent();

+  emitOptimizationRemark(F->getContext(), "tailcallelim", *F, CI->getDebugLoc(),
+                         "transforming tail recursion to loop");
+
   // OK! We can transform this tail call.  If this is the first one found,
   // create the new entry block, allowing us to branch back to the old entry.
-  if (OldEntry == 0) {
+  if (!OldEntry) {
     OldEntry = &F->getEntryBlock();
     BasicBlock *NewEntry = BasicBlock::Create(F->getContext(), "", F, OldEntry);
     NewEntry->takeName(OldEntry);
diff --git a/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp b/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp
new file mode 100644
index 0000000..cce016a
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp
@@ -0,0 +1,114 @@
+//===-- ASanStackFrameLayout.cpp - helper for AddressSanitizer ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Definition of ComputeASanStackFrameLayout (see ASanStackFrameLayout.h).
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+
+namespace llvm {
+
+// We sort the stack variables by alignment (largest first) to minimize
+// unnecessarily large gaps due to alignment.
+// It is tempting to also sort variables by size so that larger variables
+// have larger redzones at both ends. But reordering will make report analysis
+// harder, especially when temporary unnamed variables are present.
+// So, until we can provide more information (type, line number, etc.)
+// for the stack variables, we avoid reordering them too much.
+static inline bool CompareVars(const ASanStackVariableDescription &a,
+                               const ASanStackVariableDescription &b) {
+  return a.Alignment > b.Alignment;
+}
+
+// We also force minimal alignment for all vars to kMinAlignment so that vars
+// with e.g. alignment 1 and alignment 16 do not get reordered by CompareVars.
+static const size_t kMinAlignment = 16;
+
+static size_t RoundUpTo(size_t X, size_t RoundTo) {
+  assert((RoundTo & (RoundTo - 1)) == 0);
+  return (X + RoundTo - 1) & ~(RoundTo - 1);
+}
+
+// The larger the variable Size, the larger the redzone.
+// The resulting frame size is a multiple of Alignment.
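// Editor's note (worked example, not part of the patch): with
// Alignment = 16, the size classes implemented below yield
//   Size =   4  ->  16 bytes total (12 bytes of trailing redzone)
//   Size =  16  ->  32 bytes total
//   Size = 100  ->  RoundUpTo(100 + 32, 16)  = 144 bytes total
//   Size = 600  ->  RoundUpTo(600 + 128, 16) = 736 bytes total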
+static size_t VarAndRedzoneSize(size_t Size, size_t Alignment) { + size_t Res = 0; + if (Size <= 4) Res = 16; + else if (Size <= 16) Res = 32; + else if (Size <= 128) Res = Size + 32; + else if (Size <= 512) Res = Size + 64; + else if (Size <= 4096) Res = Size + 128; + else Res = Size + 256; + return RoundUpTo(Res, Alignment); +} + +void +ComputeASanStackFrameLayout(SmallVectorImpl<ASanStackVariableDescription> &Vars, + size_t Granularity, size_t MinHeaderSize, + ASanStackFrameLayout *Layout) { + assert(Granularity >= 8 && Granularity <= 64 && + (Granularity & (Granularity - 1)) == 0); + assert(MinHeaderSize >= 16 && (MinHeaderSize & (MinHeaderSize - 1)) == 0 && + MinHeaderSize >= Granularity); + size_t NumVars = Vars.size(); + assert(NumVars > 0); + for (size_t i = 0; i < NumVars; i++) + Vars[i].Alignment = std::max(Vars[i].Alignment, kMinAlignment); + + std::stable_sort(Vars.begin(), Vars.end(), CompareVars); + SmallString<2048> StackDescriptionStorage; + raw_svector_ostream StackDescription(StackDescriptionStorage); + StackDescription << NumVars; + Layout->FrameAlignment = std::max(Granularity, Vars[0].Alignment); + SmallVector<uint8_t, 64> &SB(Layout->ShadowBytes); + SB.clear(); + size_t Offset = std::max(std::max(MinHeaderSize, Granularity), + Vars[0].Alignment); + assert((Offset % Granularity) == 0); + SB.insert(SB.end(), Offset / Granularity, kAsanStackLeftRedzoneMagic); + for (size_t i = 0; i < NumVars; i++) { + bool IsLast = i == NumVars - 1; + size_t Alignment = std::max(Granularity, Vars[i].Alignment); + (void)Alignment; // Used only in asserts. + size_t Size = Vars[i].Size; + const char *Name = Vars[i].Name; + assert((Alignment & (Alignment - 1)) == 0); + assert(Layout->FrameAlignment >= Alignment); + assert((Offset % Alignment) == 0); + assert(Size > 0); + StackDescription << " " << Offset << " " << Size << " " << strlen(Name) + << " " << Name; + size_t NextAlignment = IsLast ? Granularity + : std::max(Granularity, Vars[i + 1].Alignment); + size_t SizeWithRedzone = VarAndRedzoneSize(Vars[i].Size, NextAlignment); + SB.insert(SB.end(), Size / Granularity, 0); + if (Size % Granularity) + SB.insert(SB.end(), Size % Granularity); + SB.insert(SB.end(), (SizeWithRedzone - Size) / Granularity, + IsLast ? kAsanStackRightRedzoneMagic + : kAsanStackMidRedzoneMagic); + Vars[i].Offset = Offset; + Offset += SizeWithRedzone; + } + if (Offset % MinHeaderSize) { + size_t ExtraRedzone = MinHeaderSize - (Offset % MinHeaderSize); + SB.insert(SB.end(), ExtraRedzone / Granularity, + kAsanStackRightRedzoneMagic); + Offset += ExtraRedzone; + } + Layout->DescriptionString = StackDescription.str(); + Layout->FrameSize = Offset; + assert((Layout->FrameSize % MinHeaderSize) == 0); + assert(Layout->FrameSize / Granularity == Layout->ShadowBytes.size()); +} + +} // llvm namespace diff --git a/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp b/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp new file mode 100644 index 0000000..196ac79 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp @@ -0,0 +1,222 @@ +//===- AddDiscriminators.cpp - Insert DWARF path discriminators -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file adds DWARF discriminators to the IR. 
Path discriminators are
+// used to decide what CFG path was taken inside sub-graphs whose instructions
+// share the same line and column number information.
+//
+// The main user of this is the sample profiler. Instruction samples are
+// mapped to line number information. Since a single line may be spread
+// out over several basic blocks, discriminators add more precise location
+// for the samples.
+//
+// For example,
+//
+//   1  #define ASSERT(P)
+//   2      if (!(P))
+//   3        abort()
+//   ...
+//   100   while (true) {
+//   101     ASSERT (sum < 0);
+//   102     ...
+//   130   }
+//
+// when converted to IR, this snippet looks something like:
+//
+// while.body:                                       ; preds = %entry, %if.end
+//   %0 = load i32* %sum, align 4, !dbg !15
+//   %cmp = icmp slt i32 %0, 0, !dbg !15
+//   br i1 %cmp, label %if.end, label %if.then, !dbg !15
+//
+// if.then:                                          ; preds = %while.body
+//   call void @abort(), !dbg !15
+//   br label %if.end, !dbg !15
+//
+// Notice that all the instructions in blocks 'while.body' and 'if.then'
+// have exactly the same debug information. When this program is sampled
+// at runtime, the profiler will assume that all these instructions are
+// equally frequent. This, in turn, will make it consider the edge
+// while.body->if.then to be frequently taken (which is incorrect).
+//
+// By adding a discriminator value to the instructions in block 'if.then',
+// we can distinguish instructions at line 101 with discriminator 0 from
+// the instructions at line 101 with discriminator 1.
+//
+// For more details about DWARF discriminators, please visit
+// http://wiki.dwarfstd.org/index.php?title=Path_Discriminators
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "add-discriminators"
+
+namespace {
+  struct AddDiscriminators : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    AddDiscriminators() : FunctionPass(ID) {
+      initializeAddDiscriminatorsPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F) override;
+  };
+}
+
+char AddDiscriminators::ID = 0;
+INITIALIZE_PASS_BEGIN(AddDiscriminators, "add-discriminators",
+                      "Add DWARF path discriminators", false, false)
+INITIALIZE_PASS_END(AddDiscriminators, "add-discriminators",
+                    "Add DWARF path discriminators", false, false)
+
+// Command line option to disable discriminator generation even in the
+// presence of debug information. This is only needed when debugging
+// debug info generation issues.
+static cl::opt<bool>
+NoDiscriminators("no-discriminators", cl::init(false),
+                 cl::desc("Disable generation of discriminator information."));
+
+FunctionPass *llvm::createAddDiscriminatorsPass() {
+  return new AddDiscriminators();
+}
+
+static bool hasDebugInfo(const Function &F) {
+  NamedMDNode *CUNodes = F.getParent()->getNamedMetadata("llvm.dbg.cu");
+  return CUNodes != nullptr;
+}
+
+/// \brief Assign DWARF discriminators.
+///
+/// To assign discriminators, we examine the boundaries of every
+/// basic block and its successors. Suppose there is a basic block B1
+/// with successor B2.
The last instruction I1 in B1 and the first
+/// instruction I2 in B2 are located at the same file and line number.
+/// This situation is illustrated in the following code snippet:
+///
+///                if (i < 10) x = i;
+///
+///   entry:
+///     br i1 %cmp, label %if.then, label %if.end, !dbg !10
+///   if.then:
+///     %1 = load i32* %i.addr, align 4, !dbg !10
+///     store i32 %1, i32* %x, align 4, !dbg !10
+///     br label %if.end, !dbg !10
+///   if.end:
+///     ret void, !dbg !12
+///
+/// Notice how the branch instruction in block 'entry' and all the
+/// instructions in block 'if.then' have the exact same debug location
+/// information (!dbg !10).
+///
+/// To distinguish instructions in block 'entry' from instructions in
+/// block 'if.then', we generate a new lexical block for all the
+/// instructions in block 'if.then' that share the same file and line
+/// location with the last instruction of block 'entry'.
+///
+/// This new lexical block will have the same location information as
+/// the previous one, but with a new DWARF discriminator value.
+///
+/// One of the main uses of this discriminator value is in runtime
+/// sample profilers. It allows the profiler to distinguish instructions
+/// at location !dbg !10 that execute on different basic blocks. This is
+/// important because while the predicate 'i < 10' may have been
+/// executed millions of times, the assignment 'x = i' may have only
+/// executed a handful of times (meaning that the entry->if.then edge is
+/// seldom taken).
+///
+/// If we did not have discriminator information, the profiler would
+/// assign the same weight to both blocks 'entry' and 'if.then', which
+/// in turn will make it conclude that the entry->if.then edge is very
+/// hot.
+///
+/// To decide where to create new discriminator values, this function
+/// traverses the CFG and examines instructions at basic block boundaries.
+/// If the last instruction I1 of a block B1 is at the same file and line
+/// location as instruction I2 of successor B2, then it creates a new
+/// lexical block for I2 and all the instructions in B2 that share the same
+/// file and line location as I2. This new lexical block will have a
+/// different discriminator number than I1.
+bool AddDiscriminators::runOnFunction(Function &F) {
+  // If the function has debug information, but the user has disabled
+  // discriminators, do nothing.
+  // Similarly, if the function has no debug info, do nothing.
+  // Finally, if this module is built with DWARF versions earlier than 4,
+  // do nothing (discriminator support is a DWARF 4 feature).
+  if (NoDiscriminators ||
+      !hasDebugInfo(F) ||
+      F.getParent()->getDwarfVersion() < 4)
+    return false;
+
+  bool Changed = false;
+  Module *M = F.getParent();
+  LLVMContext &Ctx = M->getContext();
+  DIBuilder Builder(*M);
+
+  // Traverse all the blocks looking for instructions in different
+  // blocks that are at the same file:line location.
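// Editor's sketch of the rule the loop below implements (a paraphrase of
// the comments above, not additional behavior):
//   for each block B1 whose terminator I1 has debug location file:line L:
//     for each successor B2 whose first real instruction I2 is also at L:
//       create a lexical block copying I2's scope but carrying a fresh
//       discriminator, and re-attach it to I2 and to every following
//       instruction in B2 that still carries L.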
+ for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { + BasicBlock *B = I; + TerminatorInst *Last = B->getTerminator(); + DebugLoc LastLoc = Last->getDebugLoc(); + if (LastLoc.isUnknown()) continue; + DILocation LastDIL(LastLoc.getAsMDNode(Ctx)); + + for (unsigned I = 0; I < Last->getNumSuccessors(); ++I) { + BasicBlock *Succ = Last->getSuccessor(I); + Instruction *First = Succ->getFirstNonPHIOrDbgOrLifetime(); + DebugLoc FirstLoc = First->getDebugLoc(); + if (FirstLoc.isUnknown()) continue; + DILocation FirstDIL(FirstLoc.getAsMDNode(Ctx)); + + // If the first instruction (First) of Succ is at the same file + // location as B's last instruction (Last), add a new + // discriminator for First's location and all the instructions + // in Succ that share the same location with First. + if (FirstDIL.atSameLineAs(LastDIL)) { + // Create a new lexical scope and compute a new discriminator + // number for it. + StringRef Filename = FirstDIL.getFilename(); + unsigned LineNumber = FirstDIL.getLineNumber(); + unsigned ColumnNumber = FirstDIL.getColumnNumber(); + DIScope Scope = FirstDIL.getScope(); + DIFile File = Builder.createFile(Filename, Scope.getDirectory()); + unsigned Discriminator = FirstDIL.computeNewDiscriminator(Ctx); + DILexicalBlock NewScope = Builder.createLexicalBlock( + Scope, File, LineNumber, ColumnNumber, Discriminator); + DILocation NewDIL = FirstDIL.copyWithNewScope(Ctx, NewScope); + DebugLoc newDebugLoc = DebugLoc::getFromDILocation(NewDIL); + + // Attach this new debug location to First and every + // instruction following First that shares the same location. + for (BasicBlock::iterator I1(*First), E1 = Succ->end(); I1 != E1; + ++I1) { + if (I1->getDebugLoc() != FirstLoc) break; + I1->setDebugLoc(newDebugLoc); + DEBUG(dbgs() << NewDIL.getFilename() << ":" << NewDIL.getLineNumber() + << ":" << NewDIL.getColumnNumber() << ":" + << NewDIL.getDiscriminator() << *I1 << "\n"); + } + DEBUG(dbgs() << "\n"); + Changed = true; + } + } + } + return Changed; +} diff --git a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index 12de9ee..602e8ba 100644 --- a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -15,17 +15,17 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/IR/Constant.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Type.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/ValueHandle.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include <algorithm> @@ -68,8 +68,8 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) { void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, Pass *P) { if (!isa<PHINode>(BB->begin())) return; - AliasAnalysis *AA = 0; - MemoryDependenceAnalysis *MemDep = 0; + AliasAnalysis *AA = nullptr; + MemoryDependenceAnalysis *MemDep = nullptr; if (P) { AA = P->getAnalysisIfAvailable<AliasAnalysis>(); MemDep = P->getAnalysisIfAvailable<MemoryDependenceAnalysis>(); @@ -130,7 +130,7 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) { BasicBlock *OnlySucc = BB; for (; SI != SE; ++SI) if 
(*SI != OnlySucc) { - OnlySucc = 0; // There are multiple distinct successors! + OnlySucc = nullptr; // There are multiple distinct successors! break; } @@ -167,15 +167,17 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) { // Finally, erase the old block and update dominator info. if (P) { - if (DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>()) { - if (DomTreeNode *DTN = DT->getNode(BB)) { - DomTreeNode *PredDTN = DT->getNode(PredBB); + if (DominatorTreeWrapperPass *DTWP = + P->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { + DominatorTree &DT = DTWP->getDomTree(); + if (DomTreeNode *DTN = DT.getNode(BB)) { + DomTreeNode *PredDTN = DT.getNode(PredBB); SmallVector<DomTreeNode*, 8> Children(DTN->begin(), DTN->end()); for (SmallVectorImpl<DomTreeNode *>::iterator DI = Children.begin(), DE = Children.end(); DI != DE; ++DI) - DT->changeImmediateDominator(*DI, PredDTN); + DT.changeImmediateDominator(*DI, PredDTN); - DT->eraseNode(BB); + DT.eraseNode(BB); } if (LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>()) @@ -215,7 +217,7 @@ void llvm::ReplaceInstWithValue(BasicBlock::InstListType &BIL, /// void llvm::ReplaceInstWithInst(BasicBlock::InstListType &BIL, BasicBlock::iterator &BI, Instruction *I) { - assert(I->getParent() == 0 && + assert(I->getParent() == nullptr && "ReplaceInstWithInst: Instruction already inserted into basic block!"); // Insert the new instruction into the basic block... @@ -252,7 +254,7 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) { // If the successor only has a single pred, split the top of the successor // block. assert(SP == BB && "CFG broken"); - SP = NULL; + SP = nullptr; return SplitBlock(Succ, Succ->begin(), P); } @@ -280,18 +282,20 @@ BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, Pass *P) { if (Loop *L = LI->getLoopFor(Old)) L->addBasicBlockToLoop(New, LI->getBase()); - if (DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>()) { + if (DominatorTreeWrapperPass *DTWP = + P->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { + DominatorTree &DT = DTWP->getDomTree(); // Old dominates New. New node dominates all other nodes dominated by Old. - if (DomTreeNode *OldNode = DT->getNode(Old)) { + if (DomTreeNode *OldNode = DT.getNode(Old)) { std::vector<DomTreeNode *> Children; for (DomTreeNode::iterator I = OldNode->begin(), E = OldNode->end(); I != E; ++I) Children.push_back(*I); - DomTreeNode *NewNode = DT->addNewBlock(New,Old); + DomTreeNode *NewNode = DT.addNewBlock(New, Old); for (std::vector<DomTreeNode *>::iterator I = Children.begin(), E = Children.end(); I != E; ++I) - DT->changeImmediateDominator(*I, NewNode); + DT.changeImmediateDominator(*I, NewNode); } } @@ -306,7 +310,7 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB, if (!P) return; LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>(); - Loop *L = LI ? LI->getLoopFor(OldBB) : 0; + Loop *L = LI ? LI->getLoopFor(OldBB) : nullptr; // If we need to preserve loop analyses, collect some information about how // this split will affect loops. @@ -336,9 +340,9 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB, } // Update dominator tree if available. 
- DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>(); - if (DT) - DT->splitBlock(NewBB); + if (DominatorTreeWrapperPass *DTWP = + P->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) + DTWP->getDomTree().splitBlock(NewBB); if (!L) return; @@ -347,7 +351,7 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB, // loop). To find this, examine each of the predecessors and determine which // loops enclose them, and select the most-nested loop which contains the // loop containing the block being split. - Loop *InnermostPredLoop = 0; + Loop *InnermostPredLoop = nullptr; for (ArrayRef<BasicBlock*>::iterator i = Preds.begin(), e = Preds.end(); i != e; ++i) { BasicBlock *Pred = *i; @@ -380,51 +384,68 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB, ArrayRef<BasicBlock*> Preds, BranchInst *BI, Pass *P, bool HasLoopExit) { // Otherwise, create a new PHI node in NewBB for each PHI node in OrigBB. - AliasAnalysis *AA = P ? P->getAnalysisIfAvailable<AliasAnalysis>() : 0; + AliasAnalysis *AA = P ? P->getAnalysisIfAvailable<AliasAnalysis>() : nullptr; + SmallPtrSet<BasicBlock *, 16> PredSet(Preds.begin(), Preds.end()); for (BasicBlock::iterator I = OrigBB->begin(); isa<PHINode>(I); ) { PHINode *PN = cast<PHINode>(I++); // Check to see if all of the values coming in are the same. If so, we // don't need to create a new PHI node, unless it's needed for LCSSA. - Value *InVal = 0; + Value *InVal = nullptr; if (!HasLoopExit) { InVal = PN->getIncomingValueForBlock(Preds[0]); - for (unsigned i = 1, e = Preds.size(); i != e; ++i) - if (InVal != PN->getIncomingValueForBlock(Preds[i])) { - InVal = 0; + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + if (!PredSet.count(PN->getIncomingBlock(i))) + continue; + if (!InVal) + InVal = PN->getIncomingValue(i); + else if (InVal != PN->getIncomingValue(i)) { + InVal = nullptr; break; } + } } if (InVal) { // If all incoming values for the new PHI would be the same, just don't // make a new PHI. Instead, just remove the incoming values from the old // PHI. - for (unsigned i = 0, e = Preds.size(); i != e; ++i) { - // Explicitly check the BB index here to handle duplicates in Preds. - int Idx = PN->getBasicBlockIndex(Preds[i]); - if (Idx >= 0) - PN->removeIncomingValue(Idx, false); - } - } else { - // If the values coming into the block are not the same, we need a PHI. - // Create the new PHI node, insert it into NewBB at the end of the block - PHINode *NewPHI = - PHINode::Create(PN->getType(), Preds.size(), PN->getName() + ".ph", BI); - if (AA) AA->copyValue(PN, NewPHI); - // Move all of the PHI values for 'Preds' to the new PHI. - for (unsigned i = 0, e = Preds.size(); i != e; ++i) { - Value *V = PN->removeIncomingValue(Preds[i], false); - NewPHI->addIncoming(V, Preds[i]); - } + // NOTE! This loop walks backwards for a reason! First off, this minimizes + // the cost of removal if we end up removing a large number of values, and + // second off, this ensures that the indices for the incoming values + // aren't invalidated when we remove one. + for (int64_t i = PN->getNumIncomingValues() - 1; i >= 0; --i) + if (PredSet.count(PN->getIncomingBlock(i))) + PN->removeIncomingValue(i, false); + + // Add an incoming value to the PHI node in the loop for the preheader + // edge. + PN->addIncoming(InVal, NewBB); + continue; + } - InVal = NewPHI; + // If the values coming into the block are not the same, we need a new + // PHI. 
+ // Create the new PHI node, insert it into NewBB at the end of the block + PHINode *NewPHI = + PHINode::Create(PN->getType(), Preds.size(), PN->getName() + ".ph", BI); + if (AA) + AA->copyValue(PN, NewPHI); + + // NOTE! This loop walks backwards for a reason! First off, this minimizes + // the cost of removal if we end up removing a large number of values, and + // second off, this ensures that the indices for the incoming values aren't + // invalidated when we remove one. + for (int64_t i = PN->getNumIncomingValues() - 1; i >= 0; --i) { + BasicBlock *IncomingBB = PN->getIncomingBlock(i); + if (PredSet.count(IncomingBB)) { + Value *V = PN->removeIncomingValue(i, false); + NewPHI->addIncoming(V, IncomingBB); + } } - // Add an incoming value to the PHI node in the loop for the preheader - // edge. - PN->addIncoming(InVal, NewBB); + PN->addIncoming(NewPHI, NewBB); } } @@ -538,7 +559,7 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB, e = pred_end(OrigBB); } - BasicBlock *NewBB2 = 0; + BasicBlock *NewBB2 = nullptr; if (!NewBB2Preds.empty()) { // Create another basic block for the rest of OrigBB's predecessors. NewBB2 = BasicBlock::Create(OrigBB->getContext(), @@ -603,7 +624,7 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB, for (User::op_iterator i = NewRet->op_begin(), e = NewRet->op_end(); i != e; ++i) { Value *V = *i; - Instruction *NewBC = 0; + Instruction *NewBC = nullptr; if (BitCastInst *BCI = dyn_cast<BitCastInst>(V)) { // Return value might be bitcasted. Clone and insert it before the // return instruction. @@ -630,28 +651,30 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB, } /// SplitBlockAndInsertIfThen - Split the containing block at the -/// specified instruction - everything before and including Cmp stays -/// in the old basic block, and everything after Cmp is moved to a +/// specified instruction - everything before and including SplitBefore stays +/// in the old basic block, and everything after SplitBefore is moved to a /// new block. The two blocks are connected by a conditional branch /// (with value of Cmp being the condition). /// Before: /// Head -/// Cmp +/// SplitBefore /// Tail /// After: /// Head -/// Cmp -/// if (Cmp) +/// if (Cond) /// ThenBlock +/// SplitBefore /// Tail /// /// If Unreachable is true, then ThenBlock ends with /// UnreachableInst, otherwise it branches to Tail. /// Returns the NewBasicBlock's terminator. 
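// ---- Editor's sketch (hypothetical usage, not part of the patch) ----
// How a caller might use the new Cond-taking overload defined just below
// to emit a slow path; Ptr, InsertPt and ReportFn stand for
// caller-provided values.
IRBuilder<> B(InsertPt);
Value *IsNull = B.CreateIsNull(Ptr);
TerminatorInst *ThenTerm =
    SplitBlockAndInsertIfThen(IsNull, InsertPt, /*Unreachable=*/false,
                              /*BranchWeights=*/nullptr, /*DT=*/nullptr);
IRBuilder<> ThenB(ThenTerm);  // inserts before the then-block's branch
ThenB.CreateCall(ReportFn);   // runs only when Ptr is null
// ---------------------------------------------------------------------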
-TerminatorInst *llvm::SplitBlockAndInsertIfThen(Instruction *Cmp, - bool Unreachable, MDNode *BranchWeights) { - Instruction *SplitBefore = Cmp->getNextNode(); +TerminatorInst *llvm::SplitBlockAndInsertIfThen(Value *Cond, + Instruction *SplitBefore, + bool Unreachable, + MDNode *BranchWeights, + DominatorTree *DT) { BasicBlock *Head = SplitBefore->getParent(); BasicBlock *Tail = Head->splitBasicBlock(SplitBefore); TerminatorInst *HeadOldTerm = Head->getTerminator(); @@ -662,13 +685,65 @@ TerminatorInst *llvm::SplitBlockAndInsertIfThen(Instruction *Cmp, CheckTerm = new UnreachableInst(C, ThenBlock); else CheckTerm = BranchInst::Create(Tail, ThenBlock); + CheckTerm->setDebugLoc(SplitBefore->getDebugLoc()); BranchInst *HeadNewTerm = - BranchInst::Create(/*ifTrue*/ThenBlock, /*ifFalse*/Tail, Cmp); + BranchInst::Create(/*ifTrue*/ThenBlock, /*ifFalse*/Tail, Cond); + HeadNewTerm->setDebugLoc(SplitBefore->getDebugLoc()); HeadNewTerm->setMetadata(LLVMContext::MD_prof, BranchWeights); ReplaceInstWithInst(HeadOldTerm, HeadNewTerm); + + if (DT) { + if (DomTreeNode *OldNode = DT->getNode(Head)) { + std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end()); + + DomTreeNode *NewNode = DT->addNewBlock(Tail, Head); + for (auto Child : Children) + DT->changeImmediateDominator(Child, NewNode); + + // Head dominates ThenBlock. + DT->addNewBlock(ThenBlock, Head); + } + } + return CheckTerm; } +/// SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, +/// but also creates the ElseBlock. +/// Before: +/// Head +/// SplitBefore +/// Tail +/// After: +/// Head +/// if (Cond) +/// ThenBlock +/// else +/// ElseBlock +/// SplitBefore +/// Tail +void llvm::SplitBlockAndInsertIfThenElse(Value *Cond, Instruction *SplitBefore, + TerminatorInst **ThenTerm, + TerminatorInst **ElseTerm, + MDNode *BranchWeights) { + BasicBlock *Head = SplitBefore->getParent(); + BasicBlock *Tail = Head->splitBasicBlock(SplitBefore); + TerminatorInst *HeadOldTerm = Head->getTerminator(); + LLVMContext &C = Head->getContext(); + BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); + BasicBlock *ElseBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); + *ThenTerm = BranchInst::Create(Tail, ThenBlock); + (*ThenTerm)->setDebugLoc(SplitBefore->getDebugLoc()); + *ElseTerm = BranchInst::Create(Tail, ElseBlock); + (*ElseTerm)->setDebugLoc(SplitBefore->getDebugLoc()); + BranchInst *HeadNewTerm = + BranchInst::Create(/*ifTrue*/ThenBlock, /*ifFalse*/ElseBlock, Cond); + HeadNewTerm->setDebugLoc(SplitBefore->getDebugLoc()); + HeadNewTerm->setMetadata(LLVMContext::MD_prof, BranchWeights); + ReplaceInstWithInst(HeadOldTerm, HeadNewTerm); +} + + /// GetIfCondition - Given a basic block (BB) with two predecessors, /// check to see if the merge at this block is due /// to an "if condition". 
If so, return the boolean condition that determines @@ -681,32 +756,32 @@ TerminatorInst *llvm::SplitBlockAndInsertIfThen(Instruction *Cmp, Value *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue, BasicBlock *&IfFalse) { PHINode *SomePHI = dyn_cast<PHINode>(BB->begin()); - BasicBlock *Pred1 = NULL; - BasicBlock *Pred2 = NULL; + BasicBlock *Pred1 = nullptr; + BasicBlock *Pred2 = nullptr; if (SomePHI) { if (SomePHI->getNumIncomingValues() != 2) - return NULL; + return nullptr; Pred1 = SomePHI->getIncomingBlock(0); Pred2 = SomePHI->getIncomingBlock(1); } else { pred_iterator PI = pred_begin(BB), PE = pred_end(BB); if (PI == PE) // No predecessor - return NULL; + return nullptr; Pred1 = *PI++; if (PI == PE) // Only one predecessor - return NULL; + return nullptr; Pred2 = *PI++; if (PI != PE) // More than two predecessors - return NULL; + return nullptr; } // We can only handle branches. Other control flow will be lowered to // branches if possible anyway. BranchInst *Pred1Br = dyn_cast<BranchInst>(Pred1->getTerminator()); BranchInst *Pred2Br = dyn_cast<BranchInst>(Pred2->getTerminator()); - if (Pred1Br == 0 || Pred2Br == 0) - return 0; + if (!Pred1Br || !Pred2Br) + return nullptr; // Eliminate code duplication by ensuring that Pred1Br is conditional if // either are. @@ -716,7 +791,7 @@ Value *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue, // required anyway, we stand no chance of eliminating it, so the xform is // probably not profitable. if (Pred1Br->isConditional()) - return 0; + return nullptr; std::swap(Pred1, Pred2); std::swap(Pred1Br, Pred2Br); @@ -726,8 +801,8 @@ Value *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue, // The only thing we have to watch out for here is to make sure that Pred2 // doesn't have incoming edges from other blocks. If it does, the condition // doesn't dominate BB. - if (Pred2->getSinglePredecessor() == 0) - return 0; + if (!Pred2->getSinglePredecessor()) + return nullptr; // If we found a conditional branch predecessor, make sure that it branches // to BB and Pred2Br. If it doesn't, this isn't an "if statement". @@ -742,7 +817,7 @@ Value *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue, } else { // We know that one arm of the conditional goes to BB, so the other must // go somewhere unrelated, and this must not be an "if statement". - return 0; + return nullptr; } return Pred1Br->getCondition(); @@ -752,12 +827,12 @@ Value *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue, // BB. Don't panic! If both blocks only have a single (identical) // predecessor, and THAT is a conditional branch, then we're all ok! BasicBlock *CommonPred = Pred1->getSinglePredecessor(); - if (CommonPred == 0 || CommonPred != Pred2->getSinglePredecessor()) - return 0; + if (CommonPred == nullptr || CommonPred != Pred2->getSinglePredecessor()) + return nullptr; // Otherwise, if this is a conditional branch, then we can use it! 
BranchInst *BI = dyn_cast<BranchInst>(CommonPred->getTerminator()); - if (BI == 0) return 0; + if (!BI) return nullptr; assert(BI->isConditional() && "Two successors but not conditional?"); if (BI->getSuccessor(0) == Pred1) { diff --git a/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index 0e7f7f7..80bd516 100644 --- a/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -15,21 +15,22 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "break-crit-edges" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" -#include "llvm/Support/CFG.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; +#define DEBUG_TYPE "break-crit-edges" + STATISTIC(NumBroken, "Number of blocks inserted"); namespace { @@ -39,10 +40,10 @@ namespace { initializeBreakCriticalEdgesPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addPreserved<DominatorTree>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<LoopInfo>(); // No loop canonicalization guarantees are broken by this pass. @@ -141,7 +142,7 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, Pass *P, bool MergeIdenticalEdges, bool DontDeleteUselessPhis, bool SplitLandingPads) { - if (!isCriticalEdge(TI, SuccNum, MergeIdenticalEdges)) return 0; + if (!isCriticalEdge(TI, SuccNum, MergeIdenticalEdges)) return nullptr; assert(!isa<IndirectBrInst>(TI) && "Cannot split critical edge from IndirectBrInst"); @@ -151,7 +152,7 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, // Splitting the critical edge to a landing pad block is non-trivial. Don't do // it in this generic function. - if (DestBB->isLandingPad()) return 0; + if (DestBB->isLandingPad()) return nullptr; // Create a new basic block, linking it into the CFG. BasicBlock *NewBB = BasicBlock::Create(TI->getContext(), @@ -207,13 +208,15 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, // If we don't have a pass object, we can't update anything... - if (P == 0) return NewBB; + if (!P) return NewBB; - DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>(); + DominatorTreeWrapperPass *DTWP = + P->getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>(); // If we have nothing to update, just return. - if (DT == 0 && LI == 0) + if (!DT && !LI) return NewBB; // Now update analysis information. Since the only predecessor of NewBB is @@ -249,7 +252,7 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, // if (TINode) { // Don't break unreachable code! DomTreeNode *NewBBNode = DT->addNewBlock(NewBB, TIBB); - DomTreeNode *DestBBNode = 0; + DomTreeNode *DestBBNode = nullptr; // If NewBBDominatesDestBB hasn't been computed yet, do so with DT. 
if (!OtherPreds.empty()) {
@@ -297,9 +300,8 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
         P->addBasicBlockToLoop(NewBB, LI->getBase());
       }
     }
-    // If TIBB is in a loop and DestBB is outside of that loop, split the
-    // other exit blocks of the loop that also have predecessors outside
-    // the loop, to maintain a LoopSimplify guarantee.
+    // If TIBB is in a loop and DestBB is outside of that loop, we may need
+    // to update LoopSimplify form and LCSSA form.
     if (!TIL->contains(DestBB) &&
         P->mustPreserveAnalysisID(LoopSimplifyID)) {
       assert(!TIL->contains(NewBB) &&
@@ -309,50 +311,35 @@
       if (P->mustPreserveAnalysisID(LCSSAID))
         createPHIsForSplitLoopExit(TIBB, NewBB, DestBB);

-      // For each unique exit block...
-      // FIXME: This code is functionally equivalent to the corresponding
-      // loop in LoopSimplify.
-      SmallVector<BasicBlock *, 4> ExitBlocks;
-      TIL->getExitBlocks(ExitBlocks);
-      for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
-        // Collect all the preds that are inside the loop, and note
-        // whether there are any preds outside the loop.
-        SmallVector<BasicBlock *, 4> Preds;
-        bool HasPredOutsideOfLoop = false;
-        BasicBlock *Exit = ExitBlocks[i];
-        for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit);
-             I != E; ++I) {
-          BasicBlock *P = *I;
-          if (TIL->contains(P)) {
-            if (isa<IndirectBrInst>(P->getTerminator())) {
-              Preds.clear();
-              break;
-            }
-            Preds.push_back(P);
-          } else {
-            HasPredOutsideOfLoop = true;
-          }
-        }
-        // If there are any preds not in the loop, we'll need to split
-        // the edges. The Preds.empty() check is needed because a block
-        // may appear multiple times in the list. We can't use
-        // getUniqueExitBlocks above because that depends on LoopSimplify
-        // form, which we're in the process of restoring!
-        if (!Preds.empty() && HasPredOutsideOfLoop) {
-          if (!Exit->isLandingPad()) {
-            BasicBlock *NewExitBB =
-              SplitBlockPredecessors(Exit, Preds, "split", P);
-            if (P->mustPreserveAnalysisID(LCSSAID))
-              createPHIsForSplitLoopExit(Preds, NewExitBB, Exit);
-          } else if (SplitLandingPads) {
-            SmallVector<BasicBlock*, 8> NewBBs;
-            SplitLandingPadPredecessors(Exit, Preds,
-                                        ".split1", ".split2",
-                                        P, NewBBs);
-            if (P->mustPreserveAnalysisID(LCSSAID))
-              createPHIsForSplitLoopExit(Preds, NewBBs[0], Exit);
-          }
+      // The only way that we can break LoopSimplify form by splitting a
+      // critical edge is if after the split there exists some edge from TIL
+      // to DestBB *and* the only edge into DestBB from outside of TIL is that
+      // of NewBB. If the first isn't true, then LoopSimplify still holds,
+      // NewBB is the new exit block and it has no non-loop predecessors. If
+      // the second isn't true, then DestBB was not in LoopSimplify form prior
+      // to the split as it had a non-loop predecessor. In both of these
+      // cases, the predecessor must be directly in TIL, not in a subloop, or
+      // again LoopSimplify doesn't hold.
+      SmallVector<BasicBlock *, 4> LoopPreds;
+      for (pred_iterator I = pred_begin(DestBB), E = pred_end(DestBB); I != E;
+           ++I) {
+        BasicBlock *P = *I;
+        if (P == NewBB)
+          continue; // The new block is known.
+        if (LI->getLoopFor(P) != TIL) {
+          // No need to re-simplify, it wasn't to start with.
+ LoopPreds.clear(); + break; } + LoopPreds.push_back(P); + } + if (!LoopPreds.empty()) { + assert(!DestBB->isLandingPad() && + "We don't split edges to landing pads!"); + BasicBlock *NewExitBB = + SplitBlockPredecessors(DestBB, LoopPreds, "split", P); + if (P->mustPreserveAnalysisID(LCSSAID)) + createPHIsForSplitLoopExit(LoopPreds, NewExitBB, DestBB); } } // LCSSA form was updated above for the case where LoopSimplify is diff --git a/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 6d13217..be00b695 100644 --- a/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -27,7 +27,8 @@ using namespace llvm; /// CastToCStr - Return V if it is an i8*, otherwise cast it to i8*. Value *llvm::CastToCStr(Value *V, IRBuilder<> &B) { - return B.CreateBitCast(V, B.getInt8PtrTy(), "cstr"); + unsigned AS = V->getType()->getPointerAddressSpace(); + return B.CreateBitCast(V, B.getInt8PtrTy(AS), "cstr"); } /// EmitStrLen - Emit a call to the strlen function to the builder, for the @@ -35,7 +36,7 @@ Value *llvm::CastToCStr(Value *V, IRBuilder<> &B) { Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::strlen)) - return 0; + return nullptr; Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeSet AS[2]; @@ -64,7 +65,7 @@ Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout *TD, Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B, const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::strnlen)) - return 0; + return nullptr; Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeSet AS[2]; @@ -94,7 +95,7 @@ Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B, Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B, const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::strchr)) - return 0; + return nullptr; Module *M = B.GetInsertBlock()->getParent()->getParent(); Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind }; @@ -120,7 +121,7 @@ Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B, const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::strncmp)) - return 0; + return nullptr; Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeSet AS[3]; @@ -153,7 +154,7 @@ Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B, const DataLayout *TD, const TargetLibraryInfo *TLI, StringRef Name) { if (!TLI->has(LibFunc::strcpy)) - return 0; + return nullptr; Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeSet AS[2]; @@ -177,7 +178,7 @@ Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B, const DataLayout *TD, const TargetLibraryInfo *TLI, StringRef Name) { if (!TLI->has(LibFunc::strncpy)) - return 0; + return nullptr; Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeSet AS[2]; @@ -204,7 +205,7 @@ Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize, IRBuilder<> &B, const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::memcpy_chk)) - return 0; + return nullptr; Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeSet AS; @@ -232,7 +233,7 @@ Value *llvm::EmitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B, const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::memchr)) - 
return 0;
+    return nullptr;

   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeSet AS;
@@ -260,7 +261,7 @@ Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2,
                         Value *Len, IRBuilder<> &B, const DataLayout *TD,
                         const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::memcmp))
-    return 0;
+    return nullptr;

   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeSet AS[3];
@@ -286,6 +287,21 @@ Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2,
   return CI;
 }

+/// Append a suffix to the function name according to the type of 'Op'.
+static void AppendTypeSuffix(Value *Op, StringRef &Name, SmallString<20> &NameBuffer) {
+  if (!Op->getType()->isDoubleTy()) {
+    NameBuffer += Name;
+
+    if (Op->getType()->isFloatTy())
+      NameBuffer += 'f';
+    else
+      NameBuffer += 'l';
+
+    Name = NameBuffer;
+  }
+  return;
+}
+
 /// EmitUnaryFloatFnCall - Emit a call to the unary function named 'Name' (e.g.
 /// 'floor'). This function is known to take a single operand of type matching
 /// 'Op' and return one value with the same type. If 'Op' is a long double, 'l' is
@@ -293,15 +309,7 @@ Value *llvm::EmitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
                                   const AttributeSet &Attrs) {
   SmallString<20> NameBuffer;
-  if (!Op->getType()->isDoubleTy()) {
-    // If we need to add a suffix, copy into NameBuffer.
-    NameBuffer += Name;
-    if (Op->getType()->isFloatTy())
-      NameBuffer += 'f'; // floorf
-    else
-      NameBuffer += 'l'; // floorl
-    Name = NameBuffer;
-  }
+  AppendTypeSuffix(Op, Name, NameBuffer);

   Module *M = B.GetInsertBlock()->getParent()->getParent();
   Value *Callee = M->getOrInsertFunction(Name, Op->getType(),
@@ -314,12 +322,33 @@ Value *llvm::EmitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
   return CI;
 }

+/// EmitBinaryFloatFnCall - Emit a call to the binary function named 'Name'
+/// (e.g. 'fmin'). This function is known to take operands of type matching
+/// 'Op1' and 'Op2' and to return one value with the same type. If 'Op1/Op2'
+/// is a long double, an 'l' suffix is added to the name; if it is a float,
+/// an 'f' suffix is added.
+Value *llvm::EmitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name,
+                                   IRBuilder<> &B, const AttributeSet &Attrs) {
+  SmallString<20> NameBuffer;
+  AppendTypeSuffix(Op1, Name, NameBuffer);
+
+  Module *M = B.GetInsertBlock()->getParent()->getParent();
+  Value *Callee = M->getOrInsertFunction(Name, Op1->getType(),
+                                         Op1->getType(), Op2->getType(), NULL);
+  CallInst *CI = B.CreateCall2(Callee, Op1, Op2, Name);
+  CI->setAttributes(Attrs);
+  if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
+    CI->setCallingConv(F->getCallingConv());
+
+  return CI;
+}
+
 /// EmitPutChar - Emit a call to the putchar function. This assumes that Char
 /// is an integer.
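// Editor's note on AppendTypeSuffix/EmitBinaryFloatFnCall above (the call
// below is a hypothetical example, not from the patch): the suffix follows
// the operand type, so with a base name of "fmin"
//   double operands       -> emits a call to fmin
//   float operands        -> emits a call to fminf
//   other (long double)   -> emits a call to fminl
// e.g. EmitBinaryFloatFnCall(Op1, Op2, "fmin", B, AttributeSet()) with
// float operands emits a call to fminf.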
Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::putchar)) - return 0; + return nullptr; Module *M = B.GetInsertBlock()->getParent()->getParent(); Value *PutChar = M->getOrInsertFunction("putchar", B.getInt32Ty(), @@ -341,7 +370,7 @@ Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const DataLayout *TD, Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::puts)) - return 0; + return nullptr; Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeSet AS[2]; @@ -365,7 +394,7 @@ Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, const DataLayout *TD, Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B, const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::fputc)) - return 0; + return nullptr; Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeSet AS[2]; @@ -398,7 +427,7 @@ Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B, Value *llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::fputs)) - return 0; + return nullptr; Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeSet AS[3]; @@ -431,7 +460,7 @@ Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B, const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::fwrite)) - return 0; + return nullptr; Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeSet AS[3]; diff --git a/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp index 1f517d0..f2d5e07 100644 --- a/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp @@ -15,7 +15,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "bypass-slow-division" #include "llvm/Transforms/Utils/BypassSlowDivision.h" #include "llvm/ADT/DenseMap.h" #include "llvm/IR/Function.h" @@ -24,6 +23,8 @@ using namespace llvm; +#define DEBUG_TYPE "bypass-slow-division" + namespace { struct DivOpInfo { bool SignedOp; @@ -53,11 +54,11 @@ namespace llvm { } static DivOpInfo getEmptyKey() { - return DivOpInfo(false, 0, 0); + return DivOpInfo(false, nullptr, nullptr); } static DivOpInfo getTombstoneKey() { - return DivOpInfo(true, 0, 0); + return DivOpInfo(true, nullptr, nullptr); } static unsigned getHashValue(const DivOpInfo &Val) { diff --git a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp index d105f5e..5c8f20d 100644 --- a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -17,8 +17,9 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/DebugInfo.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" @@ -26,7 +27,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" -#include "llvm/Support/CFG.h" +#include "llvm/IR/Module.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ValueMapper.h" 
@@ -88,26 +89,28 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, assert(VMap.count(I) && "No mapping from source argument specified!"); #endif + // Copy all attributes other than those stored in the AttributeSet. We need + // to remap the parameter indices of the AttributeSet. + AttributeSet NewAttrs = NewFunc->getAttributes(); + NewFunc->copyAttributesFrom(OldFunc); + NewFunc->setAttributes(NewAttrs); + AttributeSet OldAttrs = OldFunc->getAttributes(); // Clone any argument attributes that are present in the VMap. - for (Function::const_arg_iterator I = OldFunc->arg_begin(), - E = OldFunc->arg_end(); - I != E; ++I) - if (Argument *Anew = dyn_cast<Argument>(VMap[I])) { + for (const Argument &OldArg : OldFunc->args()) + if (Argument *NewArg = dyn_cast<Argument>(VMap[&OldArg])) { AttributeSet attrs = - OldAttrs.getParamAttributes(I->getArgNo() + 1); + OldAttrs.getParamAttributes(OldArg.getArgNo() + 1); if (attrs.getNumSlots() > 0) - Anew->addAttr(attrs); + NewArg->addAttr(attrs); } - NewFunc->setAttributes(NewFunc->getAttributes() - .addAttributes(NewFunc->getContext(), - AttributeSet::ReturnIndex, - OldAttrs.getRetAttributes())); - NewFunc->setAttributes(NewFunc->getAttributes() - .addAttributes(NewFunc->getContext(), - AttributeSet::FunctionIndex, - OldAttrs.getFnAttributes())); + NewFunc->setAttributes( + NewFunc->getAttributes() + .addAttributes(NewFunc->getContext(), AttributeSet::ReturnIndex, + OldAttrs.getRetAttributes()) + .addAttributes(NewFunc->getContext(), AttributeSet::FunctionIndex, + OldAttrs.getFnAttributes())); // Loop over all of the basic blocks in the function, cloning them as // appropriate. Note that we save BE this way in order to handle cloning of @@ -151,6 +154,54 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, TypeMapper, Materializer); } +// Find the MDNode which corresponds to the DISubprogram data that described F. +static MDNode* FindSubprogram(const Function *F, DebugInfoFinder &Finder) { + for (DISubprogram Subprogram : Finder.subprograms()) { + if (Subprogram.describes(F)) return Subprogram; + } + return nullptr; +} + +// Add an operand to an existing MDNode. The new operand will be added at the +// back of the operand list. +static void AddOperand(MDNode *Node, Value *Operand) { + SmallVector<Value*, 16> Operands; + for (unsigned i = 0; i < Node->getNumOperands(); i++) { + Operands.push_back(Node->getOperand(i)); + } + Operands.push_back(Operand); + MDNode *NewNode = MDNode::get(Node->getContext(), Operands); + Node->replaceAllUsesWith(NewNode); +} + +// Clone the module-level debug info associated with OldFunc. The cloned data +// will point to NewFunc instead. +static void CloneDebugInfoMetadata(Function *NewFunc, const Function *OldFunc, + ValueToValueMapTy &VMap) { + DebugInfoFinder Finder; + Finder.processModule(*OldFunc->getParent()); + + const MDNode *OldSubprogramMDNode = FindSubprogram(OldFunc, Finder); + if (!OldSubprogramMDNode) return; + + // Ensure that OldFunc appears in the map. + // (if it's already there it must point to NewFunc anyway) + VMap[OldFunc] = NewFunc; + DISubprogram NewSubprogram(MapValue(OldSubprogramMDNode, VMap)); + + for (DICompileUnit CU : Finder.compile_units()) { + DIArray Subprograms(CU.getSubprograms()); + + // If the compile unit's function list contains the old function, it should + // also contain the new one. 
+ for (unsigned i = 0; i < Subprograms.getNumElements(); i++) { + if ((MDNode*)Subprograms.getElement(i) == OldSubprogramMDNode) { + AddOperand(Subprograms, NewSubprogram); + } + } + } +} + /// CloneFunction - Return a copy of the specified function, but without /// embedding the function into another module. Also, any references specified /// in the VMap are changed to refer to their mapped value instead of the @@ -188,6 +239,9 @@ Function *llvm::CloneFunction(const Function *F, ValueToValueMapTy &VMap, VMap[I] = DestI++; // Add mapping to VMap } + if (ModuleLevelChanges) + CloneDebugInfoMetadata(NewF, F, VMap); + SmallVector<ReturnInst*, 8> Returns; // Ignore returns cloned. CloneFunctionInto(NewF, F, VMap, ModuleLevelChanges, Returns, "", CodeInfo); return NewF; @@ -205,17 +259,17 @@ namespace { bool ModuleLevelChanges; const char *NameSuffix; ClonedCodeInfo *CodeInfo; - const DataLayout *TD; + const DataLayout *DL; public: PruningFunctionCloner(Function *newFunc, const Function *oldFunc, ValueToValueMapTy &valueMap, bool moduleLevelChanges, const char *nameSuffix, ClonedCodeInfo *codeInfo, - const DataLayout *td) + const DataLayout *DL) : NewFunc(newFunc), OldFunc(oldFunc), VMap(valueMap), ModuleLevelChanges(moduleLevelChanges), - NameSuffix(nameSuffix), CodeInfo(codeInfo), TD(td) { + NameSuffix(nameSuffix), CodeInfo(codeInfo), DL(DL) { } /// CloneBlock - The specified block is found to be reachable, clone it and @@ -272,7 +326,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, // If we can simplify this instruction to some other value, simply add // a mapping to that value rather than inserting a new instruction into // the basic block. - if (Value *V = SimplifyInstruction(NewInst, TD)) { + if (Value *V = SimplifyInstruction(NewInst, DL)) { // On the off-chance that this simplifies to an instruction in the old // function, map it back into the new function. if (Value *MappedV = VMap.lookup(V)) @@ -305,7 +359,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, // If the condition was a known constant in the callee... ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition()); // Or is a known constant in the caller... - if (Cond == 0) { + if (!Cond) { Value *V = VMap[BI->getCondition()]; Cond = dyn_cast_or_null<ConstantInt>(V); } @@ -321,7 +375,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, } else if (const SwitchInst *SI = dyn_cast<SwitchInst>(OldTI)) { // If switching on a value known constant in the caller. ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition()); - if (Cond == 0) { // Or known constant after constant prop in the callee... + if (!Cond) { // Or known constant after constant prop in the callee... Value *V = VMap[SI->getCondition()]; Cond = dyn_cast_or_null<ConstantInt>(V); } @@ -368,7 +422,7 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, SmallVectorImpl<ReturnInst*> &Returns, const char *NameSuffix, ClonedCodeInfo *CodeInfo, - const DataLayout *TD, + const DataLayout *DL, Instruction *TheCall) { assert(NameSuffix && "NameSuffix cannot be null!"); @@ -379,7 +433,7 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, #endif PruningFunctionCloner PFC(NewFunc, OldFunc, VMap, ModuleLevelChanges, - NameSuffix, CodeInfo, TD); + NameSuffix, CodeInfo, DL); // Clone the entry block, and anything recursively reachable from it. 
std::vector<const BasicBlock*> CloneWorklist; @@ -400,7 +454,7 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, BI != BE; ++BI) { Value *V = VMap[BI]; BasicBlock *NewBB = cast_or_null<BasicBlock>(V); - if (NewBB == 0) continue; // Dead block. + if (!NewBB) continue; // Dead block. // Add the new block to the new function. NewFunc->getBasicBlockList().push_back(NewBB); @@ -509,7 +563,7 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, // node). for (unsigned Idx = 0, Size = PHIToResolve.size(); Idx != Size; ++Idx) if (PHINode *PN = dyn_cast<PHINode>(VMap[PHIToResolve[Idx]])) - recursivelySimplifyInstruction(PN, TD); + recursivelySimplifyInstruction(PN, DL); // Now that the inlined function body has been fully constructed, go through // and zap unconditional fall-through branches. This happen all the time when diff --git a/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp index 64df089..3f75b3e 100644 --- a/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp @@ -47,8 +47,8 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { GlobalVariable *GV = new GlobalVariable(*New, I->getType()->getElementType(), I->isConstant(), I->getLinkage(), - (Constant*) 0, I->getName(), - (GlobalVariable*) 0, + (Constant*) nullptr, I->getName(), + (GlobalVariable*) nullptr, I->getThreadLocalMode(), I->getType()->getAddressSpace()); GV->copyAttributesFrom(I); @@ -67,8 +67,10 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { // Loop over the aliases in the module for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end(); I != E; ++I) { - GlobalAlias *GA = new GlobalAlias(I->getType(), I->getLinkage(), - I->getName(), NULL, New); + auto *PTy = cast<PointerType>(I->getType()); + auto *GA = + GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(), + I->getLinkage(), I->getName(), New); GA->copyAttributesFrom(I); VMap[I] = GA; } @@ -106,7 +108,7 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { I != E; ++I) { GlobalAlias *GA = cast<GlobalAlias>(VMap[I]); if (const Constant *C = I->getAliasee()) - GA->setAliasee(MapValue(C, VMap)); + GA->setAliasee(cast<GlobalObject>(MapValue(C, VMap))); } // And named metadata.... diff --git a/contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp b/contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp index 8fa412a..3b15a0a 100644 --- a/contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp @@ -84,7 +84,7 @@ Value *llvm::getICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS, case 7: // True. 
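getICmpValue above decodes a 3-bit comparison code into a predicate: bit 2 is "less", bit 1 is "equal", bit 0 is "greater", so 3 is "ge", 5 is "ne", 6 is "le", 7 is always-true and 0 always-false. A small check of that encoding, and of why it lets two compares of the same operands fold with bitwise or/and (cmpCode is an illustrative helper, not LLVM API):

#include <cassert>
#include <cstdint>

// Exactly one bit is set for any concrete pair, so a predicate code P
// matches iff (P & cmpCode(A, B)) != 0, and or-ing / and-ing two predicates
// over the same operands just or-s / and-s their codes.
unsigned cmpCode(int64_t A, int64_t B) {
  return (unsigned)(A < B) << 2 | (unsigned)(A == B) << 1 | (unsigned)(A > B);
}

int main() {
  for (int64_t A = -2; A <= 2; ++A)
    for (int64_t B = -2; B <= 2; ++B) {
      unsigned C = cmpCode(A, B);
      assert(((4 & C) != 0) == (A < B));   // code 4: lt
      assert(((6 & C) != 0) == (A <= B));  // code 6 = 4|2: le folds lt, eq
      assert(((3 & C) != 0) == (A >= B));  // code 3 = 2|1: ge folds eq, gt
      assert(((5 & C) != 0) == (A != B));  // code 5 = 4|1: ne
    }
  return 0;
}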
return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1); } - return NULL; + return nullptr; } /// PredicatesFoldable - Return true if both predicates match sign or if at diff --git a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 6f008644..e70a7d6 100644 --- a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -14,20 +14,20 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/CodeExtractor.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/RegionIterator.h" -#include "llvm/Analysis/Verifier.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Verifier.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -38,6 +38,8 @@ #include <set> using namespace llvm; +#define DEBUG_TYPE "code-extractor" + // Provide a command-line option to aggregate function arguments into a struct // for functions produced by the code extractor. This is useful when converting // extracted functions to pthread-based code, as only one argument (void*) can @@ -86,7 +88,7 @@ static SetVector<BasicBlock *> buildExtractionBlockSet(IteratorT BBBegin, } #ifndef NDEBUG - for (SetVector<BasicBlock *>::iterator I = llvm::next(Result.begin()), + for (SetVector<BasicBlock *>::iterator I = std::next(Result.begin()), E = Result.end(); I != E; ++I) for (pred_iterator PI = pred_begin(*I), PE = pred_end(*I); @@ -118,7 +120,7 @@ buildExtractionBlockSet(const RegionNode &RN) { } CodeExtractor::CodeExtractor(BasicBlock *BB, bool AggregateArgs) - : DT(0), AggregateArgs(AggregateArgs||AggregateArgsOpt), + : DT(nullptr), AggregateArgs(AggregateArgs||AggregateArgsOpt), Blocks(buildExtractionBlockSet(BB)), NumExitBlocks(~0U) {} CodeExtractor::CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT, @@ -171,9 +173,8 @@ void CodeExtractor::findInputsOutputs(ValueSet &Inputs, if (definedInCaller(Blocks, *OI)) Inputs.insert(*OI); - for (Value::use_iterator UI = II->use_begin(), UE = II->use_end(); - UI != UE; ++UI) - if (!definedInRegion(Blocks, *UI)) { + for (User *U : II->users()) + if (!definedInRegion(Blocks, U)) { Outputs.insert(II); break; } @@ -369,7 +370,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, } else RewriteVal = AI++; - std::vector<User*> Users(inputs[i]->use_begin(), inputs[i]->use_end()); + std::vector<User*> Users(inputs[i]->user_begin(), inputs[i]->user_end()); for (std::vector<User*>::iterator use = Users.begin(), useE = Users.end(); use != useE; ++use) if (Instruction* inst = dyn_cast<Instruction>(*use)) @@ -389,7 +390,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, // Rewrite branches to basic blocks outside of the loop to new dummy blocks // within the new function. This must be done before we lose track of which // blocks were originally in the code region. 
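The AggregateArgs option described above exists because a pthread entry point receives exactly one void*, so every input and output of the extracted region must travel through a single struct. A minimal sketch of that calling convention (ExtractedArgs and extractedRegion are hypothetical names; compile with -lpthread):

#include <pthread.h>
#include <cstdio>

// Hypothetical packed frame for an extracted region with two inputs and
// one output, mirroring the single-struct-argument form the option enables.
struct ExtractedArgs {
  int In0;
  int In1;
  int Out0;  // outputs live in the struct too and are read back by the caller
};

void *extractedRegion(void *P) {
  ExtractedArgs *Args = static_cast<ExtractedArgs *>(P);
  Args->Out0 = Args->In0 + Args->In1;  // the region's body
  return nullptr;
}

int main() {
  ExtractedArgs Args{2, 3, 0};
  pthread_t T;
  pthread_create(&T, nullptr, extractedRegion, &Args);  // one void* argument
  pthread_join(T, nullptr);
  std::printf("%d\n", Args.Out0);  // prints 5
  return 0;
}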
- std::vector<User*> Users(header->use_begin(), header->use_end()); + std::vector<User*> Users(header->user_begin(), header->user_end()); for (unsigned i = 0, e = Users.size(); i != e; ++i) // The BasicBlock which contains the branch is not in the region // modify the branch target to a new block @@ -405,14 +406,13 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, /// that uses the value within the basic block, and return the predecessor /// block associated with that use, or return 0 if none is found. static BasicBlock* FindPhiPredForUseInBlock(Value* Used, BasicBlock* BB) { - for (Value::use_iterator UI = Used->use_begin(), - UE = Used->use_end(); UI != UE; ++UI) { - PHINode *P = dyn_cast<PHINode>(*UI); + for (Use &U : Used->uses()) { + PHINode *P = dyn_cast<PHINode>(U.getUser()); if (P && P->getParent() == BB) - return P->getIncomingBlock(UI); + return P->getIncomingBlock(U); } - - return 0; + + return nullptr; } /// emitCallAndSwitchStatement - This method sets up the caller side by adding @@ -440,14 +440,14 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, StructValues.push_back(*i); } else { AllocaInst *alloca = - new AllocaInst((*i)->getType(), 0, (*i)->getName()+".loc", + new AllocaInst((*i)->getType(), nullptr, (*i)->getName()+".loc", codeReplacer->getParent()->begin()->begin()); ReloadOutputs.push_back(alloca); params.push_back(alloca); } } - AllocaInst *Struct = 0; + AllocaInst *Struct = nullptr; if (AggregateArgs && (inputs.size() + outputs.size() > 0)) { std::vector<Type*> ArgTypes; for (ValueSet::iterator v = StructValues.begin(), @@ -457,7 +457,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, // Allocate a struct at the beginning of this function Type *StructArgTy = StructType::get(newFunction->getContext(), ArgTypes); Struct = - new AllocaInst(StructArgTy, 0, "structArg", + new AllocaInst(StructArgTy, nullptr, "structArg", codeReplacer->getParent()->begin()->begin()); params.push_back(Struct); @@ -486,7 +486,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, // Reload the outputs passed in by reference for (unsigned i = 0, e = outputs.size(); i != e; ++i) { - Value *Output = 0; + Value *Output = nullptr; if (AggregateArgs) { Value *Idx[2]; Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context)); @@ -502,7 +502,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, LoadInst *load = new LoadInst(Output, outputs[i]->getName()+".reload"); Reloads.push_back(load); codeReplacer->getInstList().push_back(load); - std::vector<User*> Users(outputs[i]->use_begin(), outputs[i]->use_end()); + std::vector<User*> Users(outputs[i]->user_begin(), outputs[i]->user_end()); for (unsigned u = 0, e = Users.size(); u != e; ++u) { Instruction *inst = cast<Instruction>(Users[u]); if (!Blocks.count(inst->getParent())) @@ -539,7 +539,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, newFunction); unsigned SuccNum = switchVal++; - Value *brVal = 0; + Value *brVal = nullptr; switch (NumExitBlocks) { case 0: case 1: break; // No value needed. 
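emitCallAndSwitchStatement, whose hunks continue below, reloads the region's outputs from stack slots after the call and, when the region had several exits, switches on a selector the extracted function returns to pick the real successor. The caller side looks roughly like this sketch (outlined and the selector values are illustrative):

#include <cstdio>

// Hypothetical outlined region with one output and two original exits. The
// return value plays the role of the switch condition built below: it names
// the exit edge the region left through, and outputs come back through
// pointers to the caller's stack slots.
static int outlined(int In, int *Out) {
  *Out = In * 2;
  return In < 0 ? 0 : 1;  // 0 and 1 stand for the two original exit blocks
}

int main() {
  int Out = 0;                   // the caller-side ".loc" stack slot
  switch (outlined(21, &Out)) {  // the call that replaces the whole region
  case 0:
    std::puts("took exit A");
    break;
  default:
    std::printf("took exit B, out=%d\n", Out);  // reload of the output
    break;
  }
  return 0;
}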
@@ -635,7 +635,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, // Check if the function should return a value if (OldFnRetTy->isVoidTy()) { - ReturnInst::Create(Context, 0, TheSwitch); // Return void + ReturnInst::Create(Context, nullptr, TheSwitch); // Return void } else if (OldFnRetTy == TheSwitch->getCondition()->getType()) { // return what we have ReturnInst::Create(Context, TheSwitch->getCondition(), TheSwitch); @@ -687,7 +687,7 @@ void CodeExtractor::moveCodeToFunction(Function *newFunction) { Function *CodeExtractor::extractCodeRegion() { if (!isEligible()) - return 0; + return nullptr; ValueSet inputs, outputs; diff --git a/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp b/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp new file mode 100644 index 0000000..a359424 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp @@ -0,0 +1,183 @@ +//===- CtorUtils.cpp - Helpers for working with global_ctors ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines functions that are used to process llvm.global_ctors. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/CtorUtils.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "ctor_utils" + +namespace llvm { + +namespace { +/// Given a specified llvm.global_ctors list, install the +/// specified array. +void installGlobalCtors(GlobalVariable *GCL, + const std::vector<Function *> &Ctors) { + // If we made a change, reassemble the initializer list. + Constant *CSVals[3]; + + StructType *StructTy = + cast<StructType>(GCL->getType()->getElementType()->getArrayElementType()); + + // Create the new init list. + std::vector<Constant *> CAList; + for (Function *F : Ctors) { + Type *Int32Ty = Type::getInt32Ty(GCL->getContext()); + if (F) { + CSVals[0] = ConstantInt::get(Int32Ty, 65535); + CSVals[1] = F; + } else { + CSVals[0] = ConstantInt::get(Int32Ty, 0x7fffffff); + CSVals[1] = Constant::getNullValue(StructTy->getElementType(1)); + } + // FIXME: Only allow the 3-field form in LLVM 4.0. + size_t NumElts = StructTy->getNumElements(); + if (NumElts > 2) + CSVals[2] = Constant::getNullValue(StructTy->getElementType(2)); + CAList.push_back( + ConstantStruct::get(StructTy, makeArrayRef(CSVals, NumElts))); + } + + // Create the array initializer. + Constant *CA = + ConstantArray::get(ArrayType::get(StructTy, CAList.size()), CAList); + + // If we didn't change the number of elements, don't create a new GV. + if (CA->getType() == GCL->getInitializer()->getType()) { + GCL->setInitializer(CA); + return; + } + + // Create the new global and insert it next to the existing list. + GlobalVariable *NGV = + new GlobalVariable(CA->getType(), GCL->isConstant(), GCL->getLinkage(), + CA, "", GCL->getThreadLocalMode()); + GCL->getParent()->getGlobalList().insert(GCL, NGV); + NGV->takeName(GCL); + + // Nuke the old list, replacing any uses with the new one. 
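installGlobalCtors above rebuilds the llvm.global_ctors initializer from {priority, function} pairs, appending a third null field when the module already uses the newer form. One element of that array corresponds to a record like the following (a descriptive model, not code from the patch):

#include <cstdint>

// Shape of one llvm.global_ctors element as rebuilt above. The 2-field form
// predates the 3-field one; the FIXME notes that only the 3-field form
// should remain by LLVM 4.0.
struct GlobalCtorEntry {
  int32_t Priority;      // 65535 for real entries; the patch writes
                         // 0x7fffffff with a null Ctor for terminators
  void (*Ctor)();        // null in a terminator entry
  void *AssociatedData;  // 3-field form only; null when unused
};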
+ if (!GCL->use_empty()) { + Constant *V = NGV; + if (V->getType() != GCL->getType()) + V = ConstantExpr::getBitCast(V, GCL->getType()); + GCL->replaceAllUsesWith(V); + } + GCL->eraseFromParent(); +} + +/// Given a llvm.global_ctors list that we can understand, +/// return a list of the functions and null terminator as a vector. +std::vector<Function*> parseGlobalCtors(GlobalVariable *GV) { + if (GV->getInitializer()->isNullValue()) + return std::vector<Function *>(); + ConstantArray *CA = cast<ConstantArray>(GV->getInitializer()); + std::vector<Function *> Result; + Result.reserve(CA->getNumOperands()); + for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i) { + ConstantStruct *CS = cast<ConstantStruct>(*i); + Result.push_back(dyn_cast<Function>(CS->getOperand(1))); + } + return Result; +} + +/// Find the llvm.global_ctors list, verifying that all initializers have an +/// init priority of 65535. +GlobalVariable *findGlobalCtors(Module &M) { + GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors"); + if (!GV) + return nullptr; + + // Verify that the initializer is simple enough for us to handle. We are + // only allowed to optimize the initializer if it is unique. + if (!GV->hasUniqueInitializer()) + return nullptr; + + if (isa<ConstantAggregateZero>(GV->getInitializer())) + return GV; + ConstantArray *CA = cast<ConstantArray>(GV->getInitializer()); + + for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i) { + if (isa<ConstantAggregateZero>(*i)) + continue; + ConstantStruct *CS = cast<ConstantStruct>(*i); + if (isa<ConstantPointerNull>(CS->getOperand(1))) + continue; + + // Must have a function or null ptr. + if (!isa<Function>(CS->getOperand(1))) + return nullptr; + + // Init priority must be standard. + ConstantInt *CI = cast<ConstantInt>(CS->getOperand(0)); + if (CI->getZExtValue() != 65535) + return nullptr; + } + + return GV; +} +} // namespace + +/// Call "ShouldRemove" for every entry in M's global_ctor list and remove the +/// entries for which it returns true. Return true if anything changed. +bool optimizeGlobalCtorsList(Module &M, + function_ref<bool(Function *)> ShouldRemove) { + GlobalVariable *GlobalCtors = findGlobalCtors(M); + if (!GlobalCtors) + return false; + + std::vector<Function *> Ctors = parseGlobalCtors(GlobalCtors); + if (Ctors.empty()) + return false; + + bool MadeChange = false; + + // Loop over global ctors, optimizing them when we can. + for (unsigned i = 0; i != Ctors.size(); ++i) { + Function *F = Ctors[i]; + // Found a null terminator in the middle of the list, prune off the rest of + // the list. + if (!F) { + if (i != Ctors.size() - 1) { + Ctors.resize(i + 1); + MadeChange = true; + } + break; + } + DEBUG(dbgs() << "Optimizing Global Constructor: " << *F << "\n"); + + // We cannot simplify external ctor functions. + if (F->empty()) + continue; + + // If we can evaluate the ctor at compile time, do. 
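The pruning loop of optimizeGlobalCtorsList, finished just below, erases entries in place and truncates at a null terminator found mid-list. The same loop shape over a plain vector (removeMatching is an illustrative name):

#include <functional>
#include <vector>

// Mirror of the loop completed below: erase rejected entries, stepping the
// index back so the element that shifted down is revisited, and cut the
// list at an interior null terminator.
bool removeMatching(std::vector<void (*)()> &Ctors,
                    const std::function<bool(void (*)())> &ShouldRemove) {
  bool Changed = false;
  for (unsigned I = 0; I != Ctors.size(); ++I) {
    if (!Ctors[I]) {  // null terminator found mid-list
      if (I != Ctors.size() - 1) {
        Ctors.resize(I + 1);
        Changed = true;
      }
      break;
    }
    if (ShouldRemove(Ctors[I])) {
      Ctors.erase(Ctors.begin() + I);
      Changed = true;
      --I;  // wraps at 0, but the loop's ++I brings it back, as in the pass
    }
  }
  return Changed;
}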
+ if (ShouldRemove(F)) { + Ctors.erase(Ctors.begin() + i); + MadeChange = true; + --i; + continue; + } + } + + if (!MadeChange) + return false; + + installGlobalCtors(GlobalCtors, Ctors); + return true; +} + +} // End llvm namespace diff --git a/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp index 0723b35..9972b22 100644 --- a/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp +++ b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp @@ -8,12 +8,12 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Analysis/CFG.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; /// DemoteRegToStack - This function takes a virtual register computed by an @@ -25,23 +25,23 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, Instruction *AllocaPoint) { if (I.use_empty()) { I.eraseFromParent(); - return 0; + return nullptr; } // Create a stack slot to hold the value. AllocaInst *Slot; if (AllocaPoint) { - Slot = new AllocaInst(I.getType(), 0, + Slot = new AllocaInst(I.getType(), nullptr, I.getName()+".reg2mem", AllocaPoint); } else { Function *F = I.getParent()->getParent(); - Slot = new AllocaInst(I.getType(), 0, I.getName()+".reg2mem", + Slot = new AllocaInst(I.getType(), nullptr, I.getName()+".reg2mem", F->getEntryBlock().begin()); } // Change all of the users of the instruction to read from the stack slot. while (!I.use_empty()) { - Instruction *U = cast<Instruction>(I.use_back()); + Instruction *U = cast<Instruction>(I.user_back()); if (PHINode *PN = dyn_cast<PHINode>(U)) { // If this is a PHI node, we can't insert a load of the value before the // use. Instead insert the load in the predecessor block corresponding @@ -56,7 +56,7 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) if (PN->getIncomingValue(i) == &I) { Value *&V = Loads[PN->getIncomingBlock(i)]; - if (V == 0) { + if (!V) { // Insert the load into the predecessor block V = new LoadInst(Slot, I.getName()+".reload", VolatileLoads, PN->getIncomingBlock(i)->getTerminator()); @@ -110,17 +110,17 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) { if (P->use_empty()) { P->eraseFromParent(); - return 0; + return nullptr; } // Create a stack slot to hold the value. 
AllocaInst *Slot; if (AllocaPoint) { - Slot = new AllocaInst(P->getType(), 0, + Slot = new AllocaInst(P->getType(), nullptr, P->getName()+".reg2mem", AllocaPoint); } else { Function *F = P->getParent()->getParent(); - Slot = new AllocaInst(P->getType(), 0, P->getName()+".reg2mem", + Slot = new AllocaInst(P->getType(), nullptr, P->getName()+".reg2mem", F->getEntryBlock().begin()); } diff --git a/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp b/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp index 1da226b..51ead40 100644 --- a/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp +++ b/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "flattencfg" #include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -22,16 +21,19 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; +#define DEBUG_TYPE "flattencfg" + namespace { class FlattenCFGOpt { AliasAnalysis *AA; /// \brief Use parallel-and or parallel-or to generate conditions for /// conditional branches. - bool FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder, Pass *P = 0); + bool FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder, + Pass *P = nullptr); /// \brief If \param BB is the merge block of an if-region, attempt to merge /// the if-region with an adjacent if-region upstream if two if-regions /// contain identical instructions. - bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder, Pass *P = 0); + bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder, Pass *P = nullptr); /// \brief Compare a pair of blocks: \p Block1 and \p Block2, which /// are from two if-regions whose entry blocks are \p Head1 and \p /// Head2. \returns true if \p Block1 and \p Block2 contain identical @@ -126,9 +128,9 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder, if (PHI) return false; // For simplicity, avoid cases containing PHI nodes. - BasicBlock *LastCondBlock = NULL; - BasicBlock *FirstCondBlock = NULL; - BasicBlock *UnCondBlock = NULL; + BasicBlock *LastCondBlock = nullptr; + BasicBlock *FirstCondBlock = nullptr; + BasicBlock *UnCondBlock = nullptr; int Idx = -1; // Check predecessors of \param BB. 
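FlattenParallelAndOr, declared above, rewrites a chain of single-predecessor condition blocks that branch to common targets into one block testing a merged condition. The shape of the rewrite on plain C++ control flow (before/after are illustrative functions):

// Before flattening: two condition blocks, two conditional branches.
bool before(bool C1, bool C2) {
  if (!C1)
    return false;
  if (!C2)
    return false;
  return true;
}

// After flattening: one block, one branch on the merged condition. Legal
// here only because evaluating C2 has no side effects, the same safety
// condition the pass checks on the blocks it merges.
bool after(bool C1, bool C2) {
  return C1 & C2;  // non-short-circuit '&' is the "parallel and"
}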
@@ -240,7 +242,7 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder, BranchInst *BI = dyn_cast<BranchInst>(CurrBlock->getTerminator()); CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition()); CmpInst::Predicate Predicate = CI->getPredicate(); - // Cannonicalize icmp_ne -> icmp_eq, fcmp_one -> fcmp_oeq + // Canonicalize icmp_ne -> icmp_eq, fcmp_one -> fcmp_oeq if ((Predicate == CmpInst::ICMP_NE) || (Predicate == CmpInst::FCMP_ONE)) { CI->setPredicate(ICmpInst::getInversePredicate(Predicate)); BI->swapSuccessors(); diff --git a/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp index 5f0a563..12057e4 100644 --- a/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp +++ b/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp @@ -9,9 +9,9 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/Support/CallSite.h" #include "llvm/Transforms/Utils/GlobalStatus.h" using namespace llvm; @@ -35,9 +35,8 @@ bool llvm::isSafeToDestroyConstant(const Constant *C) { if (isa<GlobalValue>(C)) return false; - for (Value::const_use_iterator UI = C->use_begin(), E = C->use_end(); UI != E; - ++UI) - if (const Constant *CU = dyn_cast<Constant>(*UI)) { + for (const User *U : C->users()) + if (const Constant *CU = dyn_cast<Constant>(U)) { if (!isSafeToDestroyConstant(CU)) return false; } else @@ -47,10 +46,9 @@ bool llvm::isSafeToDestroyConstant(const Constant *C) { static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS, SmallPtrSet<const PHINode *, 16> &PhiUsers) { - for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; - ++UI) { - const User *U = *UI; - if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) { + for (const Use &U : V->uses()) { + const User *UR = U.getUser(); + if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(UR)) { GS.HasNonInstructionUser = true; // If the result of the constantexpr isn't pointer type, then we won't @@ -60,10 +58,10 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS, if (analyzeGlobalAux(CE, GS, PhiUsers)) return true; - } else if (const Instruction *I = dyn_cast<Instruction>(U)) { + } else if (const Instruction *I = dyn_cast<Instruction>(UR)) { if (!GS.HasMultipleAccessingFunctions) { const Function *F = I->getParent()->getParent(); - if (GS.AccessingFunction == 0) + if (!GS.AccessingFunction) GS.AccessingFunction = F; else if (GS.AccessingFunction != F) GS.HasMultipleAccessingFunctions = true; @@ -150,13 +148,13 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS, return true; GS.StoredType = GlobalStatus::Stored; } else if (ImmutableCallSite C = I) { - if (!C.isCallee(UI)) + if (!C.isCallee(&U)) return true; GS.IsLoaded = true; } else { return true; // Any other non-load instruction might take address! } - } else if (const Constant *C = dyn_cast<Constant>(U)) { + } else if (const Constant *C = dyn_cast<Constant>(UR)) { GS.HasNonInstructionUser = true; // We might have a dead and dangling constant hanging off of here. 
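The analyzeGlobalAux hunk above moves from iterating users to iterating uses because the callee test needs the operand slot, not just the using instruction: isCallee(&U) asks whether this particular edge is the called-function operand rather than an argument. A toy model of the distinction (Use and User here are simplified stand-ins, not the LLVM classes):

#include <vector>

struct User;

// One Use is one edge in the use graph: which User, and which operand slot.
struct Use {
  User *TheUser;
  unsigned OperandNo;
};

struct User {
  std::vector<Use *> Operands;
};

// Knowing only the User, "called as a function" and "passed as an argument"
// look identical; knowing the Use, the operand slot tells them apart. The
// callee slot is fixed at 0 purely for this sketch.
bool isCalleeOperand(const Use &U) { return U.OperandNo == 0; }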
if (!isSafeToDestroyConstant(C)) @@ -178,6 +176,6 @@ bool GlobalStatus::analyzeGlobal(const Value *V, GlobalStatus &GS) { GlobalStatus::GlobalStatus() : IsCompared(false), IsLoaded(false), StoredType(NotStored), - StoredOnceValue(0), AccessingFunction(0), + StoredOnceValue(nullptr), AccessingFunction(nullptr), HasMultipleAccessingFunctions(false), HasNonInstructionUser(false), Ordering(NotAtomic) {} diff --git a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp index d021bce..f0a9f2b 100644 --- a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -17,17 +17,18 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/DebugInfo.h" #include "llvm/IR/Attributes.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" -#include "llvm/Support/CallSite.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -51,8 +52,8 @@ namespace { public: InvokeInliningInfo(InvokeInst *II) - : OuterResumeDest(II->getUnwindDest()), InnerResumeDest(0), - CallerLPad(0), InnerEHValuesPHI(0) { + : OuterResumeDest(II->getUnwindDest()), InnerResumeDest(nullptr), + CallerLPad(nullptr), InnerEHValuesPHI(nullptr) { // If there are PHI nodes in the unwind destination block, we need to keep // track of which values came into them from the invoke before removing // the edge from this block. @@ -144,7 +145,6 @@ BasicBlock *InvokeInliningInfo::getInnerResumeDest() { void InvokeInliningInfo::forwardResume(ResumeInst *RI, SmallPtrSet<LandingPadInst*, 16> &InlinedLPads) { BasicBlock *Dest = getInnerResumeDest(); - LandingPadInst *OuterLPad = getLandingPadInst(); BasicBlock *Src = RI->getParent(); BranchInst::Create(Dest, Src); @@ -155,16 +155,6 @@ void InvokeInliningInfo::forwardResume(ResumeInst *RI, InnerEHValuesPHI->addIncoming(RI->getOperand(0), Src); RI->eraseFromParent(); - - // Append the clauses from the outer landing pad instruction into the inlined - // landing pad instructions. - for (SmallPtrSet<LandingPadInst*, 16>::iterator I = InlinedLPads.begin(), - E = InlinedLPads.end(); I != E; ++I) { - LandingPadInst *InlinedLPad = *I; - for (unsigned OuterIdx = 0, OuterNum = OuterLPad->getNumClauses(); - OuterIdx != OuterNum; ++OuterIdx) - InlinedLPad->addClause(OuterLPad->getClause(OuterIdx)); - } } /// HandleCallsInBlockInlinedThroughInvoke - When we inline a basic block into @@ -172,22 +162,11 @@ void InvokeInliningInfo::forwardResume(ResumeInst *RI, /// invokes. This function analyze BB to see if there are any calls, and if so, /// it rewrites them to be invokes that jump to InvokeDest and fills in the PHI /// nodes in that block with the values specified in InvokeDestPHIValues. -/// -/// Returns true to indicate that the next block should be skipped. 
-static bool HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, +static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, InvokeInliningInfo &Invoke) { - LandingPadInst *LPI = Invoke.getLandingPadInst(); - for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) { Instruction *I = BBI++; - if (LandingPadInst *L = dyn_cast<LandingPadInst>(I)) { - unsigned NumClauses = LPI->getNumClauses(); - L->reserveClauses(NumClauses); - for (unsigned i = 0; i != NumClauses; ++i) - L->addClause(LPI->getClause(i)); - } - // We only need to check for function calls: inlined invoke // instructions require no special handling. CallInst *CI = dyn_cast<CallInst>(I); @@ -210,6 +189,7 @@ static bool HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, InvokeInst *II = InvokeInst::Create(CI->getCalledValue(), Split, Invoke.getOuterResumeDest(), InvokeArgs, CI->getName(), BB); + II->setDebugLoc(CI->getDebugLoc()); II->setCallingConv(CI->getCallingConv()); II->setAttributes(CI->getAttributes()); @@ -223,10 +203,8 @@ static bool HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, // Update any PHI nodes in the exceptional block to indicate that there is // now a new entry in them. Invoke.addIncomingPHIValuesFor(BB); - return false; + return; } - - return false; } /// HandleInlinedInvoke - If we inlined an invoke site, we need to convert calls @@ -252,13 +230,23 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, if (InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator())) InlinedLPads.insert(II->getLandingPadInst()); + // Append the clauses from the outer landing pad instruction into the inlined + // landing pad instructions. + LandingPadInst *OuterLPad = Invoke.getLandingPadInst(); + for (SmallPtrSet<LandingPadInst*, 16>::iterator I = InlinedLPads.begin(), + E = InlinedLPads.end(); I != E; ++I) { + LandingPadInst *InlinedLPad = *I; + unsigned OuterNum = OuterLPad->getNumClauses(); + InlinedLPad->reserveClauses(OuterNum); + for (unsigned OuterIdx = 0; OuterIdx != OuterNum; ++OuterIdx) + InlinedLPad->addClause(OuterLPad->getClause(OuterIdx)); + if (OuterLPad->isCleanup()) + InlinedLPad->setCleanup(true); + } + for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E; ++BB){ if (InlinedCodeInfo.ContainsCalls) - if (HandleCallsInBlockInlinedThroughInvoke(BB, Invoke)) { - // Honor a request to skip the next block. - ++BB; - continue; - } + HandleCallsInBlockInlinedThroughInvoke(BB, Invoke); // Forward any resumes that are remaining here. if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator())) @@ -303,13 +291,13 @@ static void UpdateCallGraphAfterInlining(CallSite CS, ValueToValueMapTy::iterator VMI = VMap.find(OrigCall); // Only copy the edge if the call was inlined! - if (VMI == VMap.end() || VMI->second == 0) + if (VMI == VMap.end() || VMI->second == nullptr) continue; // If the call was inlined, but then constant folded, there is no edge to // add. Check for this case. Instruction *NewCall = dyn_cast<Instruction>(VMI->second); - if (NewCall == 0) continue; + if (!NewCall) continue; // Remember that this call site got inlined for the client of // InlineFunction. @@ -320,7 +308,7 @@ static void UpdateCallGraphAfterInlining(CallSite CS, // happens, set the callee of the new call site to a more precise // destination. This can also happen if the call graph node of the caller // was just unnecessarily imprecise. 
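The relocated loop above gives every landing pad inlined from the callee the outer landing pad's clauses, and keeps it a cleanup if the outer one was. Reduced to plain data, the merge is just this (LandingPad here is a toy record, not the LLVM instruction):

#include <vector>

// Every landing pad inlined from the callee must also handle whatever the
// outer landing pad handled, mirroring the reserveClauses/addClause/
// setCleanup loop above.
struct LandingPad {
  std::vector<int> Clauses;  // clause identities reduced to ints
  bool Cleanup = false;
};

void mergeOuterInto(const LandingPad &Outer, LandingPad &Inlined) {
  Inlined.Clauses.insert(Inlined.Clauses.end(), Outer.Clauses.begin(),
                         Outer.Clauses.end());
  Inlined.Cleanup |= Outer.Cleanup;
}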
- if (I->second->getFunction() == 0) + if (!I->second->getFunction()) if (Function *F = CallSite(NewCall).getCalledFunction()) { // Indirect call site resolved to direct call. CallerNode->addCalledFunction(CallSite(NewCall), CG[F]); @@ -336,13 +324,44 @@ static void UpdateCallGraphAfterInlining(CallSite CS, CallerNode->removeCallEdgeFor(CS); } +static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M, + BasicBlock *InsertBlock, + InlineFunctionInfo &IFI) { + LLVMContext &Context = Src->getContext(); + Type *VoidPtrTy = Type::getInt8PtrTy(Context); + Type *AggTy = cast<PointerType>(Src->getType())->getElementType(); + Type *Tys[3] = { VoidPtrTy, VoidPtrTy, Type::getInt64Ty(Context) }; + Function *MemCpyFn = Intrinsic::getDeclaration(M, Intrinsic::memcpy, Tys); + IRBuilder<> builder(InsertBlock->begin()); + Value *DstCast = builder.CreateBitCast(Dst, VoidPtrTy, "tmp"); + Value *SrcCast = builder.CreateBitCast(Src, VoidPtrTy, "tmp"); + + Value *Size; + if (IFI.DL == nullptr) + Size = ConstantExpr::getSizeOf(AggTy); + else + Size = ConstantInt::get(Type::getInt64Ty(Context), + IFI.DL->getTypeStoreSize(AggTy)); + + // Always generate a memcpy of alignment 1 here because we don't know + // the alignment of the src pointer. Other optimizations can infer + // better alignment. + Value *CallArgs[] = { + DstCast, SrcCast, Size, + ConstantInt::get(Type::getInt32Ty(Context), 1), + ConstantInt::getFalse(Context) // isVolatile + }; + builder.CreateCall(MemCpyFn, CallArgs); +} + /// HandleByValArgument - When inlining a call site that has a byval argument, /// we have to make the implicit memcpy explicit by adding it. static Value *HandleByValArgument(Value *Arg, Instruction *TheCall, const Function *CalledFunc, InlineFunctionInfo &IFI, unsigned ByValAlignment) { - Type *AggTy = cast<PointerType>(Arg->getType())->getElementType(); + PointerType *ArgTy = cast<PointerType>(Arg->getType()); + Type *AggTy = ArgTy->getElementType(); // If the called function is readonly, then it could not mutate the caller's // copy of the byval'd memory. In this case, it is safe to elide the copy and @@ -357,21 +376,17 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall, // If the pointer is already known to be sufficiently aligned, or if we can // round it up to a larger alignment, then we don't need a temporary. if (getOrEnforceKnownAlignment(Arg, ByValAlignment, - IFI.TD) >= ByValAlignment) + IFI.DL) >= ByValAlignment) return Arg; // Otherwise, we have to make a memcpy to get a safe alignment. This is bad // for code quality, but rarely happens and is required for correctness. } - - LLVMContext &Context = Arg->getContext(); - Type *VoidPtrTy = Type::getInt8PtrTy(Context); - // Create the alloca. If we have DataLayout, use nice alignment. unsigned Align = 1; - if (IFI.TD) - Align = IFI.TD->getPrefTypeAlignment(AggTy); + if (IFI.DL) + Align = IFI.DL->getPrefTypeAlignment(AggTy); // If the byval had an alignment specified, we *must* use at least that // alignment, as it is required by the byval argument (and uses of the @@ -380,32 +395,9 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall, Function *Caller = TheCall->getParent()->getParent(); - Value *NewAlloca = new AllocaInst(AggTy, 0, Align, Arg->getName(), + Value *NewAlloca = new AllocaInst(AggTy, nullptr, Align, Arg->getName(), &*Caller->begin()->begin()); - // Emit a memcpy. 
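HandleByValArgumentInit above materializes the callee's private copy of a byval argument as an alloca plus a memcpy of alignment 1, since the source pointer's alignment is unknown. In C terms the inlined call site behaves like this sketch (S and the names are illustrative):

#include <cstring>

struct S { int A[4]; };

// What the pass materializes at the inlined call site: a fresh stack slot
// (the alloca) initialized by value from the caller's object (the memcpy,
// emitted with alignment 1 above). Writes in the inlined body then touch
// only the copy, preserving byval semantics.
void inlinedBody(S *Src) {
  S Tmp;                              // the new alloca
  std::memcpy(&Tmp, Src, sizeof(S));  // the injected initialization
  Tmp.A[0] = 42;                      // mutation stays local to the copy
  (void)Tmp;
}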
- Type *Tys[3] = {VoidPtrTy, VoidPtrTy, Type::getInt64Ty(Context)}; - Function *MemCpyFn = Intrinsic::getDeclaration(Caller->getParent(), - Intrinsic::memcpy, - Tys); - Value *DestCast = new BitCastInst(NewAlloca, VoidPtrTy, "tmp", TheCall); - Value *SrcCast = new BitCastInst(Arg, VoidPtrTy, "tmp", TheCall); - - Value *Size; - if (IFI.TD == 0) - Size = ConstantExpr::getSizeOf(AggTy); - else - Size = ConstantInt::get(Type::getInt64Ty(Context), - IFI.TD->getTypeStoreSize(AggTy)); - - // Always generate a memcpy of alignment 1 here because we don't know - // the alignment of the src pointer. Other optimizations can infer - // better alignment. - Value *CallArgs[] = { - DestCast, SrcCast, Size, - ConstantInt::get(Type::getInt32Ty(Context), 1), - ConstantInt::getFalse(Context) // isVolatile - }; - IRBuilder<>(TheCall).CreateCall(MemCpyFn, CallArgs); + IFI.StaticAllocas.push_back(cast<AllocaInst>(NewAlloca)); // Uses of the argument in the function should use our new alloca // instead. @@ -415,9 +407,8 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall, // isUsedByLifetimeMarker - Check whether this Value is used by a lifetime // intrinsic. static bool isUsedByLifetimeMarker(Value *V) { - for (Value::use_iterator UI = V->use_begin(), UE = V->use_end(); UI != UE; - ++UI) { - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(*UI)) { + for (User *U : V->users()) { + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) { switch (II->getIntrinsicID()) { default: break; case Intrinsic::lifetime_start: @@ -432,16 +423,17 @@ static bool isUsedByLifetimeMarker(Value *V) { // hasLifetimeMarkers - Check whether the given alloca already has // lifetime.start or lifetime.end intrinsics. static bool hasLifetimeMarkers(AllocaInst *AI) { - Type *Int8PtrTy = Type::getInt8PtrTy(AI->getType()->getContext()); - if (AI->getType() == Int8PtrTy) + Type *Ty = AI->getType(); + Type *Int8PtrTy = Type::getInt8PtrTy(Ty->getContext(), + Ty->getPointerAddressSpace()); + if (Ty == Int8PtrTy) return isUsedByLifetimeMarker(AI); // Do a scan to find all the casts to i8*. - for (Value::use_iterator I = AI->use_begin(), E = AI->use_end(); I != E; - ++I) { - if (I->getType() != Int8PtrTy) continue; - if (I->stripPointerCasts() != AI) continue; - if (isUsedByLifetimeMarker(*I)) + for (User *U : AI->users()) { + if (U->getType() != Int8PtrTy) continue; + if (U->stripPointerCasts() != AI) continue; + if (isUsedByLifetimeMarker(U)) return true; } return false; @@ -475,7 +467,13 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ++BI) { DebugLoc DL = BI->getDebugLoc(); - if (!DL.isUnknown()) { + if (DL.isUnknown()) { + // If the inlined instruction has no line number, make it look as if it + // originates from the call location. This is important for + // ((__always_inline__, __nodebug__)) functions which must use caller + // location for all instructions in their function body. + BI->setDebugLoc(TheCallDL); + } else { BI->setDebugLoc(updateInlinedAtInfo(DL, TheCallDL, BI->getContext())); if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(BI)) { LLVMContext &Ctx = BI->getContext(); @@ -488,6 +486,33 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, } } +/// Returns a musttail call instruction if one immediately precedes the given +/// return instruction with an optional bitcast instruction between them. 
+static CallInst *getPrecedingMustTailCall(ReturnInst *RI) { + Instruction *Prev = RI->getPrevNode(); + if (!Prev) + return nullptr; + + if (Value *RV = RI->getReturnValue()) { + if (RV != Prev) + return nullptr; + + // Look through the optional bitcast. + if (auto *BI = dyn_cast<BitCastInst>(Prev)) { + RV = BI->getOperand(0); + Prev = BI->getPrevNode(); + if (!Prev || RV != Prev) + return nullptr; + } + } + + if (auto *CI = dyn_cast<CallInst>(Prev)) { + if (CI->isMustTailCall()) + return CI; + } + return nullptr; +} + /// InlineFunction - This function inlines the called function into the basic /// block of the caller. This returns false if it is not possible to inline /// this call. The program is still in a well defined state if this occurs @@ -507,15 +532,10 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, IFI.reset(); const Function *CalledFunc = CS.getCalledFunction(); - if (CalledFunc == 0 || // Can't inline external function or indirect + if (!CalledFunc || // Can't inline external function or indirect CalledFunc->isDeclaration() || // call, or call to a vararg function! CalledFunc->getFunctionType()->isVarArg()) return false; - // If the call to the callee is not a tail call, we must clear the 'tail' - // flags on any calls that we inline. - bool MustClearTailCallFlags = - !(isa<CallInst>(TheCall) && cast<CallInst>(TheCall)->isTailCall()); - // If the call to the callee cannot throw, set the 'nounwind' flag on any // calls that we inline. bool MarkNoUnwind = CS.doesNotThrow(); @@ -535,7 +555,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, } // Get the personality function from the callee if it contains a landing pad. - Value *CalleePersonality = 0; + Value *CalleePersonality = nullptr; for (Function::const_iterator I = CalledFunc->begin(), E = CalledFunc->end(); I != E; ++I) if (const InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator())) { @@ -578,6 +598,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, { // Scope to destroy VMap after cloning. ValueToValueMapTy VMap; + // Keep a list of pair (dst, src) to emit byval initializations. + SmallVector<std::pair<Value*, Value*>, 4> ByValInit; assert(CalledFunc->arg_size() == CS.arg_size() && "No varargs calls can be inlined!"); @@ -597,11 +619,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, if (CS.isByValArgument(ArgNo)) { ActualArg = HandleByValArgument(ActualArg, TheCall, CalledFunc, IFI, CalledFunc->getParamAlignment(ArgNo+1)); - - // Calls that we inline may use the new alloca, so we need to clear - // their 'tail' flags if HandleByValArgument introduced a new alloca and - // the callee has calls. - MustClearTailCallFlags |= ActualArg != *AI; + if (ActualArg != *AI) + ByValInit.push_back(std::make_pair(ActualArg, (Value*) *AI)); } VMap[I] = ActualArg; @@ -613,11 +632,16 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // happy with whatever the cloner can do. CloneAndPruneFunctionInto(Caller, CalledFunc, VMap, /*ModuleLevelChanges=*/false, Returns, ".i", - &InlinedFunctionInfo, IFI.TD, TheCall); + &InlinedFunctionInfo, IFI.DL, TheCall); // Remember the first block that is newly cloned over. FirstNewBlock = LastBlock; ++FirstNewBlock; + // Inject byval arguments initialization. + for (std::pair<Value*, Value*> &Init : ByValInit) + HandleByValArgumentInit(Init.first, Init.second, Caller->getParent(), + FirstNewBlock, IFI); + // Update the callgraph if requested. 
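getPrecedingMustTailCall above walks backwards from a return, through at most one bitcast, to find a musttail call. Over a toy instruction stream the matcher reduces to the following (the real one additionally checks that the returned value is the result of the instruction it looks through):

#include <vector>

enum Kind { MustTailCall, Bitcast, Ret, Other };

// Walk back from the return at RetIdx through one optional bitcast; return
// the index of the musttail call, or -1 if the pattern does not match.
int precedingMustTailCall(const std::vector<Kind> &Block, int RetIdx) {
  int Prev = RetIdx - 1;
  if (Prev >= 0 && Block[Prev] == Bitcast)
    --Prev;  // look through the optional cast
  if (Prev >= 0 && Block[Prev] == MustTailCall)
    return Prev;
  return -1;
}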
if (IFI.CG) UpdateCallGraphAfterInlining(CS, FirstNewBlock, VMap, IFI); @@ -635,7 +659,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, for (BasicBlock::iterator I = FirstNewBlock->begin(), E = FirstNewBlock->end(); I != E; ) { AllocaInst *AI = dyn_cast<AllocaInst>(I++); - if (AI == 0) continue; + if (!AI) continue; // If the alloca is now dead, remove it. This often occurs due to code // specialization. @@ -667,6 +691,45 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, } } + bool InlinedMustTailCalls = false; + if (InlinedFunctionInfo.ContainsCalls) { + CallInst::TailCallKind CallSiteTailKind = CallInst::TCK_None; + if (CallInst *CI = dyn_cast<CallInst>(TheCall)) + CallSiteTailKind = CI->getTailCallKind(); + + for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E; + ++BB) { + for (Instruction &I : *BB) { + CallInst *CI = dyn_cast<CallInst>(&I); + if (!CI) + continue; + + // We need to reduce the strength of any inlined tail calls. For + // musttail, we have to avoid introducing potential unbounded stack + // growth. For example, if functions 'f' and 'g' are mutually recursive + // with musttail, we can inline 'g' into 'f' so long as we preserve + // musttail on the cloned call to 'f'. If either the inlined call site + // or the cloned call site is *not* musttail, the program already has + // one frame of stack growth, so it's safe to remove musttail. Here is + // a table of example transformations: + // + // f -> musttail g -> musttail f ==> f -> musttail f + // f -> musttail g -> tail f ==> f -> tail f + // f -> g -> musttail f ==> f -> f + // f -> g -> tail f ==> f -> f + CallInst::TailCallKind ChildTCK = CI->getTailCallKind(); + ChildTCK = std::min(CallSiteTailKind, ChildTCK); + CI->setTailCallKind(ChildTCK); + InlinedMustTailCalls |= CI->isMustTailCall(); + + // Calls inlined through a 'nounwind' call site should be marked + // 'nounwind'. + if (MarkNoUnwind) + CI->setDoesNotThrow(); + } + } + } + // Leave lifetime markers for the static alloca's, scoping them to the // function we just inlined. if (InsertLifetime && !IFI.StaticAllocas.empty()) { @@ -680,12 +743,12 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, continue; // Try to determine the size of the allocation. - ConstantInt *AllocaSize = 0; + ConstantInt *AllocaSize = nullptr; if (ConstantInt *AIArraySize = dyn_cast<ConstantInt>(AI->getArraySize())) { - if (IFI.TD) { + if (IFI.DL) { Type *AllocaType = AI->getAllocatedType(); - uint64_t AllocaTypeSize = IFI.TD->getTypeAllocSize(AllocaType); + uint64_t AllocaTypeSize = IFI.DL->getTypeAllocSize(AllocaType); uint64_t AllocaArraySize = AIArraySize->getLimitedValue(); assert(AllocaArraySize > 0 && "array size of AllocaInst is zero"); // Check that array size doesn't saturate uint64_t and doesn't @@ -699,9 +762,12 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, } builder.CreateLifetimeStart(AI, AllocaSize); - for (unsigned ri = 0, re = Returns.size(); ri != re; ++ri) { - IRBuilder<> builder(Returns[ri]); - builder.CreateLifetimeEnd(AI, AllocaSize); + for (ReturnInst *RI : Returns) { + // Don't insert llvm.lifetime.end calls between a musttail call and a + // return. The return kills all local allocas. 
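The table in the hunk above is implemented as std::min over the tail-call kind, relying on the ordering none < tail < musttail: one non-tail frame anywhere in the chain licenses dropping the stronger marker. A check of that encoding against the four table rows (the enum mirrors CallInst's TCK_* values):

#include <algorithm>
#include <cassert>

// Weaker guarantees compare lower, so std::min strength-reduces the
// inlined call exactly as the table above describes.
enum TailCallKind { TCK_None = 0, TCK_Tail = 1, TCK_MustTail = 2 };

int main() {
  auto Combine = [](TailCallKind Site, TailCallKind Child) {
    return std::min(Site, Child);
  };
  // f -> musttail g -> musttail f  ==>  f -> musttail f
  assert(Combine(TCK_MustTail, TCK_MustTail) == TCK_MustTail);
  // f -> musttail g -> tail f      ==>  f -> tail f
  assert(Combine(TCK_MustTail, TCK_Tail) == TCK_Tail);
  // f -> g -> musttail f           ==>  f -> f
  assert(Combine(TCK_None, TCK_MustTail) == TCK_None);
  // f -> g -> tail f               ==>  f -> f
  assert(Combine(TCK_None, TCK_Tail) == TCK_None);
  return 0;
}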
+ if (InlinedMustTailCalls && getPrecedingMustTailCall(RI)) + continue; + IRBuilder<>(RI).CreateLifetimeEnd(AI, AllocaSize); } } } @@ -720,33 +786,56 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Insert a call to llvm.stackrestore before any return instructions in the // inlined function. - for (unsigned i = 0, e = Returns.size(); i != e; ++i) { - IRBuilder<>(Returns[i]).CreateCall(StackRestore, SavedPtr); + for (ReturnInst *RI : Returns) { + // Don't insert llvm.stackrestore calls between a musttail call and a + // return. The return will restore the stack pointer. + if (InlinedMustTailCalls && getPrecedingMustTailCall(RI)) + continue; + IRBuilder<>(RI).CreateCall(StackRestore, SavedPtr); } } - // If we are inlining tail call instruction through a call site that isn't - // marked 'tail', we must remove the tail marker for any calls in the inlined - // code. Also, calls inlined through a 'nounwind' call site should be marked - // 'nounwind'. - if (InlinedFunctionInfo.ContainsCalls && - (MustClearTailCallFlags || MarkNoUnwind)) { - for (Function::iterator BB = FirstNewBlock, E = Caller->end(); - BB != E; ++BB) - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) - if (CallInst *CI = dyn_cast<CallInst>(I)) { - if (MustClearTailCallFlags) - CI->setTailCall(false); - if (MarkNoUnwind) - CI->setDoesNotThrow(); - } - } - // If we are inlining for an invoke instruction, we must make sure to rewrite // any call instructions into invoke instructions. if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) HandleInlinedInvoke(II, FirstNewBlock, InlinedFunctionInfo); + // Handle any inlined musttail call sites. In order for a new call site to be + // musttail, the source of the clone and the inlined call site must have been + // musttail. Therefore it's safe to return without merging control into the + // phi below. + if (InlinedMustTailCalls) { + // Check if we need to bitcast the result of any musttail calls. + Type *NewRetTy = Caller->getReturnType(); + bool NeedBitCast = !TheCall->use_empty() && TheCall->getType() != NewRetTy; + + // Handle the returns preceded by musttail calls separately. + SmallVector<ReturnInst *, 8> NormalReturns; + for (ReturnInst *RI : Returns) { + CallInst *ReturnedMustTail = getPrecedingMustTailCall(RI); + if (!ReturnedMustTail) { + NormalReturns.push_back(RI); + continue; + } + if (!NeedBitCast) + continue; + + // Delete the old return and any preceding bitcast. + BasicBlock *CurBB = RI->getParent(); + auto *OldCast = dyn_cast_or_null<BitCastInst>(RI->getReturnValue()); + RI->eraseFromParent(); + if (OldCast) + OldCast->eraseFromParent(); + + // Insert a new bitcast and return with the right type. + IRBuilder<> Builder(CurBB); + Builder.CreateRet(Builder.CreateBitCast(ReturnedMustTail, NewRetTy)); + } + + // Leave behind the normal returns so we can merge control flow. + std::swap(Returns, NormalReturns); + } + // If we cloned in _exactly one_ basic block, and if that block ends in a // return instruction, we splice the body of the inlined callee directly into // the calling basic block. @@ -790,7 +879,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // "starter" and "ender" blocks. How we accomplish this depends on whether // this is an invoke instruction or a call instruction. 
BasicBlock *AfterCallBB; - BranchInst *CreatedBranchToNormalDest = NULL; + BranchInst *CreatedBranchToNormalDest = nullptr; if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) { // Add an unconditional branch to make this look like the CallInst case... @@ -829,7 +918,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // any users of the original call/invoke instruction. Type *RTy = CalledFunc->getReturnType(); - PHINode *PHI = 0; + PHINode *PHI = nullptr; if (Returns.size() > 1) { // The PHI node should go at the front of the new basic block to merge all // possible incoming values. @@ -902,6 +991,11 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Since we are now done with the Call/Invoke, we can delete it. TheCall->eraseFromParent(); + // If we inlined any musttail calls and the original return is now + // unreachable, delete it. It can only contain a bitcast and ret. + if (InlinedMustTailCalls && pred_begin(AfterCallBB) == pred_end(AfterCallBB)) + AfterCallBB->eraseFromParent(); + // We should always be able to fold the entry block of the function into the // single predecessor of the block... assert(cast<BranchInst>(Br)->isUnconditional() && "splitBasicBlock broken!"); @@ -922,7 +1016,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // the entries are the same or undef). If so, remove the PHI so it doesn't // block other optimizations. if (PHI) { - if (Value *V = SimplifyInstruction(PHI, IFI.TD)) { + if (Value *V = SimplifyInstruction(PHI, IFI.DL)) { PHI->replaceAllUsesWith(V); PHI->eraseFromParent(); } diff --git a/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp b/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp index a020bc7..da890a2 100644 --- a/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp +++ b/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp @@ -27,11 +27,11 @@ namespace { initializeInstNamerPass(*PassRegistry::getPassRegistry()); } - void getAnalysisUsage(AnalysisUsage &Info) const { + void getAnalysisUsage(AnalysisUsage &Info) const override { Info.setPreservesAll(); } - bool runOnFunction(Function &F) { + bool runOnFunction(Function &F) override { for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end(); AI != AE; ++AI) if (!AI->hasName() && !AI->getType()->isVoidTy()) diff --git a/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp b/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp index 3cb8ded..9f91eeb 100644 --- a/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp +++ b/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp @@ -7,22 +7,24 @@ // //===----------------------------------------------------------------------===// // -// This file contains an implementation of 32bit scalar integer division for -// targets that don't have native support. It's largely derived from -// compiler-rt's implementation of __udivsi3, but hand-tuned to reduce the -// amount of control flow +// This file contains an implementation of 32bit and 64bit scalar integer +// division for targets that don't have native support. It's largely derived +// from compiler-rt's implementations of __udivsi3 and __udivmoddi4, +// but hand-tuned for targets that prefer less control flow. 
// //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "integer-division" #include "llvm/Transforms/Utils/IntegerDivision.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" +#include <utility> using namespace llvm; +#define DEBUG_TYPE "integer-division" + /// Generate code to compute the remainder of two signed integers. Returns the /// remainder, which will have the sign of the dividend. Builder's insert point /// should be pointing where the caller wants code generated, e.g. at the srem @@ -31,7 +33,18 @@ using namespace llvm; /// be expanded if the user wishes static Value *generateSignedRemainderCode(Value *Dividend, Value *Divisor, IRBuilder<> &Builder) { - ConstantInt *ThirtyOne = Builder.getInt32(31); + unsigned BitWidth = Dividend->getType()->getIntegerBitWidth(); + ConstantInt *Shift; + + if (BitWidth == 64) { + Shift = Builder.getInt64(63); + } else { + assert(BitWidth == 32 && "Unexpected bit width"); + Shift = Builder.getInt32(31); + } + + // Following instructions are generated for both i32 (shift 31) and + // i64 (shift 63). // ; %dividend_sgn = ashr i32 %dividend, 31 // ; %divisor_sgn = ashr i32 %divisor, 31 @@ -42,8 +55,8 @@ static Value *generateSignedRemainderCode(Value *Dividend, Value *Divisor, // ; %urem = urem i32 %dividend, %divisor // ; %xored = xor i32 %urem, %dividend_sgn // ; %srem = sub i32 %xored, %dividend_sgn - Value *DividendSign = Builder.CreateAShr(Dividend, ThirtyOne); - Value *DivisorSign = Builder.CreateAShr(Divisor, ThirtyOne); + Value *DividendSign = Builder.CreateAShr(Dividend, Shift); + Value *DivisorSign = Builder.CreateAShr(Divisor, Shift); Value *DvdXor = Builder.CreateXor(Dividend, DividendSign); Value *DvsXor = Builder.CreateXor(Divisor, DivisorSign); Value *UDividend = Builder.CreateSub(DvdXor, DividendSign); @@ -68,6 +81,8 @@ static Value *generatedUnsignedRemainderCode(Value *Dividend, Value *Divisor, IRBuilder<> &Builder) { // Remainder = Dividend - Quotient*Divisor + // Following instructions are generated for both i32 and i64 + // ; %quotient = udiv i32 %dividend, %divisor // ; %product = mul i32 %divisor, %quotient // ; %remainder = sub i32 %dividend, %product @@ -88,9 +103,20 @@ static Value *generatedUnsignedRemainderCode(Value *Dividend, Value *Divisor, /// present, i.e. not folded), ready to be expanded if the user wishes. static Value *generateSignedDivisionCode(Value *Dividend, Value *Divisor, IRBuilder<> &Builder) { - // Implementation taken from compiler-rt's __divsi3 + // Implementation taken from compiler-rt's __divsi3 and __divdi3 + + unsigned BitWidth = Dividend->getType()->getIntegerBitWidth(); + ConstantInt *Shift; + + if (BitWidth == 64) { + Shift = Builder.getInt64(63); + } else { + assert(BitWidth == 32 && "Unexpected bit width"); + Shift = Builder.getInt32(31); + } - ConstantInt *ThirtyOne = Builder.getInt32(31); + // Following instructions are generated for both i32 (shift 31) and + // i64 (shift 63). 
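generateSignedRemainderCode above reduces srem to urem: strip both signs with the ashr/xor/sub idiom, take the unsigned remainder, then reapply the dividend's sign. The same sequence in plain C++ for 32 bits (srem32 is an illustrative name; the sign mask is computed with a comparison instead of the IR's ashr so the sketch stays well-defined even for INT32_MIN):

#include <cassert>
#include <cstdint>

// srem from urem, mirroring the IR sequence above: sign mask, |x| via
// xor/sub, unsigned remainder, then the dividend's sign reapplied.
int32_t srem32(int32_t Dividend, int32_t Divisor) {
  uint32_t DvdSign = Dividend < 0 ? ~0u : 0u;
  uint32_t DvsSign = Divisor < 0 ? ~0u : 0u;
  uint32_t UDividend = ((uint32_t)Dividend ^ DvdSign) - DvdSign;  // |Dividend|
  uint32_t UDivisor = ((uint32_t)Divisor ^ DvsSign) - DvsSign;    // |Divisor|
  uint32_t URem = UDividend % UDivisor;
  return (int32_t)((URem ^ DvdSign) - DvdSign);  // sign of the dividend
}

int main() {
  assert(srem32(7, 3) == 1);
  assert(srem32(-7, 3) == -1);  // remainder keeps the dividend's sign
  assert(srem32(7, -3) == 1);
  assert(srem32(-7, -3) == -1);
  return 0;
}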
// ; %tmp = ashr i32 %dividend, 31 // ; %tmp1 = ashr i32 %divisor, 31 @@ -102,8 +128,8 @@ static Value *generateSignedDivisionCode(Value *Dividend, Value *Divisor, // ; %q_mag = udiv i32 %u_dvnd, %u_dvsr // ; %tmp4 = xor i32 %q_mag, %q_sgn // ; %q = sub i32 %tmp4, %q_sgn - Value *Tmp = Builder.CreateAShr(Dividend, ThirtyOne); - Value *Tmp1 = Builder.CreateAShr(Divisor, ThirtyOne); + Value *Tmp = Builder.CreateAShr(Dividend, Shift); + Value *Tmp1 = Builder.CreateAShr(Divisor, Shift); Value *Tmp2 = Builder.CreateXor(Tmp, Dividend); Value *U_Dvnd = Builder.CreateSub(Tmp2, Tmp); Value *Tmp3 = Builder.CreateXor(Tmp1, Divisor); @@ -119,9 +145,9 @@ static Value *generateSignedDivisionCode(Value *Dividend, Value *Divisor, return Q; } -/// Generates code to divide two unsigned scalar 32-bit integers. Returns the -/// quotient, rounded towards 0. Builder's insert point should be pointing where -/// the caller wants code generated, e.g. at the udiv instruction. +/// Generates code to divide two unsigned scalar 32-bit or 64-bit integers. +/// Returns the quotient, rounded towards 0. Builder's insert point should +/// point where the caller wants code generated, e.g. at the udiv instruction. static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor, IRBuilder<> &Builder) { // The basic algorithm can be found in the compiler-rt project's @@ -129,18 +155,33 @@ static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor, // that's been hand-tuned to lessen the amount of control flow involved. // Some helper values - IntegerType *I32Ty = Builder.getInt32Ty(); + IntegerType *DivTy = cast<IntegerType>(Dividend->getType()); + unsigned BitWidth = DivTy->getBitWidth(); + + ConstantInt *Zero; + ConstantInt *One; + ConstantInt *NegOne; + ConstantInt *MSB; + + if (BitWidth == 64) { + Zero = Builder.getInt64(0); + One = Builder.getInt64(1); + NegOne = ConstantInt::getSigned(DivTy, -1); + MSB = Builder.getInt64(63); + } else { + assert(BitWidth == 32 && "Unexpected bit width"); + Zero = Builder.getInt32(0); + One = Builder.getInt32(1); + NegOne = ConstantInt::getSigned(DivTy, -1); + MSB = Builder.getInt32(31); + } - ConstantInt *Zero = Builder.getInt32(0); - ConstantInt *One = Builder.getInt32(1); - ConstantInt *ThirtyOne = Builder.getInt32(31); - ConstantInt *NegOne = ConstantInt::getSigned(I32Ty, -1); - ConstantInt *True = Builder.getTrue(); + ConstantInt *True = Builder.getTrue(); BasicBlock *IBB = Builder.GetInsertBlock(); Function *F = IBB->getParent(); - Function *CTLZi32 = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz, - I32Ty); + Function *CTLZ = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz, + DivTy); // Our CFG is going to look like: // +---------------------+ @@ -190,6 +231,8 @@ static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor, // We'll be overwriting the terminator to insert our extra blocks SpecialCases->getTerminator()->eraseFromParent(); + // Same instructions are generated for both i32 (msb 31) and i64 (msb 63). + // First off, check for special cases: dividend or divisor is zero, divisor // is greater than dividend, and divisor is 1. 
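The IR assembled in the hunks around this point is the restoring shift-subtract division from compiler-rt's __udivsi3/__udivmoddi4, now emitted for both i32 and i64. The same algorithm in plain C++ for 32 bits, with the block structure kept as comments (udiv32 is an illustrative name; assumes GCC/Clang's __builtin_clz, and writes the IR's ashr sign-mask trick as a comparison to stay well-defined):

#include <cassert>
#include <cstdint>

uint32_t udiv32(uint32_t N, uint32_t D) {
  // special-cases block: zero operands, divisor wider than the dividend,
  // and SR == 31 (dividend MSB set, divisor == 1) returning the dividend.
  if (N == 0 || D == 0)
    return 0;  // the expansion yields 0 for division by zero
  int SR = __builtin_clz(D) - __builtin_clz(N);
  if (SR < 0 || SR > 31)
    return 0;  // D > N
  if (SR == 31)
    return N;  // N has the top bit set and D == 1
  // bb1/preheader: position quotient and remainder for SR+1 iterations.
  unsigned Iters = (unsigned)SR + 1;
  uint32_t Q = N << (32 - Iters);  // low bits of N, pre-shifted into Q
  uint32_t R = N >> Iters;         // high bits of N seed the remainder
  uint32_t Carry = 0;
  // do-while block: one restoring division step per iteration.
  for (; Iters != 0; --Iters) {
    R = (R << 1) | (Q >> 31);  // shift the next dividend bit into R
    Q = (Q << 1) | Carry;
    uint32_t S = (R >= D) ? ~0u : 0u;  // the IR builds this mask with ashr
    Carry = S & 1;
    R -= D & S;  // subtract only when the divisor fits
  }
  // loop-exit block: shift in the final quotient bit.
  return (Q << 1) | Carry;
}

int main() {
  for (uint32_t N : {1u, 7u, 100u, 0x80000000u, 0xFFFFFFFFu})
    for (uint32_t D : {1u, 3u, 9u, 0x80000000u})
      assert(udiv32(N, D) == N / D);
  return 0;
}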
// ; special-cases: @@ -209,12 +252,12 @@ static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor, Value *Ret0_1 = Builder.CreateICmpEQ(Divisor, Zero); Value *Ret0_2 = Builder.CreateICmpEQ(Dividend, Zero); Value *Ret0_3 = Builder.CreateOr(Ret0_1, Ret0_2); - Value *Tmp0 = Builder.CreateCall2(CTLZi32, Divisor, True); - Value *Tmp1 = Builder.CreateCall2(CTLZi32, Dividend, True); + Value *Tmp0 = Builder.CreateCall2(CTLZ, Divisor, True); + Value *Tmp1 = Builder.CreateCall2(CTLZ, Dividend, True); Value *SR = Builder.CreateSub(Tmp0, Tmp1); - Value *Ret0_4 = Builder.CreateICmpUGT(SR, ThirtyOne); + Value *Ret0_4 = Builder.CreateICmpUGT(SR, MSB); Value *Ret0 = Builder.CreateOr(Ret0_3, Ret0_4); - Value *RetDividend = Builder.CreateICmpEQ(SR, ThirtyOne); + Value *RetDividend = Builder.CreateICmpEQ(SR, MSB); Value *RetVal = Builder.CreateSelect(Ret0, Zero, Dividend); Value *EarlyRet = Builder.CreateOr(Ret0, RetDividend); Builder.CreateCondBr(EarlyRet, End, BB1); @@ -227,7 +270,7 @@ static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor, // ; br i1 %skipLoop, label %loop-exit, label %preheader Builder.SetInsertPoint(BB1); Value *SR_1 = Builder.CreateAdd(SR, One); - Value *Tmp2 = Builder.CreateSub(ThirtyOne, SR); + Value *Tmp2 = Builder.CreateSub(MSB, SR); Value *Q = Builder.CreateShl(Dividend, Tmp2); Value *SkipLoop = Builder.CreateICmpEQ(SR_1, Zero); Builder.CreateCondBr(SkipLoop, LoopExit, Preheader); @@ -260,17 +303,17 @@ static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor, // ; %tmp12 = icmp eq i32 %sr_2, 0 // ; br i1 %tmp12, label %loop-exit, label %do-while Builder.SetInsertPoint(DoWhile); - PHINode *Carry_1 = Builder.CreatePHI(I32Ty, 2); - PHINode *SR_3 = Builder.CreatePHI(I32Ty, 2); - PHINode *R_1 = Builder.CreatePHI(I32Ty, 2); - PHINode *Q_2 = Builder.CreatePHI(I32Ty, 2); + PHINode *Carry_1 = Builder.CreatePHI(DivTy, 2); + PHINode *SR_3 = Builder.CreatePHI(DivTy, 2); + PHINode *R_1 = Builder.CreatePHI(DivTy, 2); + PHINode *Q_2 = Builder.CreatePHI(DivTy, 2); Value *Tmp5 = Builder.CreateShl(R_1, One); - Value *Tmp6 = Builder.CreateLShr(Q_2, ThirtyOne); + Value *Tmp6 = Builder.CreateLShr(Q_2, MSB); Value *Tmp7 = Builder.CreateOr(Tmp5, Tmp6); Value *Tmp8 = Builder.CreateShl(Q_2, One); Value *Q_1 = Builder.CreateOr(Carry_1, Tmp8); Value *Tmp9 = Builder.CreateSub(Tmp4, Tmp7); - Value *Tmp10 = Builder.CreateAShr(Tmp9, 31); + Value *Tmp10 = Builder.CreateAShr(Tmp9, MSB); Value *Carry = Builder.CreateAnd(Tmp10, One); Value *Tmp11 = Builder.CreateAnd(Tmp10, Divisor); Value *R = Builder.CreateSub(Tmp7, Tmp11); @@ -285,8 +328,8 @@ static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor, // ; %q_4 = or i32 %carry_2, %tmp13 // ; br label %end Builder.SetInsertPoint(LoopExit); - PHINode *Carry_2 = Builder.CreatePHI(I32Ty, 2); - PHINode *Q_3 = Builder.CreatePHI(I32Ty, 2); + PHINode *Carry_2 = Builder.CreatePHI(DivTy, 2); + PHINode *Q_3 = Builder.CreatePHI(DivTy, 2); Value *Tmp13 = Builder.CreateShl(Q_3, One); Value *Q_4 = Builder.CreateOr(Carry_2, Tmp13); Builder.CreateBr(End); @@ -295,7 +338,7 @@ static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor, // ; %q_5 = phi i32 [ %q_4, %loop-exit ], [ %retVal, %special-cases ] // ; ret i32 %q_5 Builder.SetInsertPoint(End, End->begin()); - PHINode *Q_5 = Builder.CreatePHI(I32Ty, 2); + PHINode *Q_5 = Builder.CreatePHI(DivTy, 2); // Populate the Phis, since all values have now been created. 
Our Phis were: // ; %carry_1 = phi i32 [ 0, %preheader ], [ %carry, %do-while ] @@ -326,9 +369,8 @@ static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor, /// Generate code to calculate the remainder of two integers, replacing Rem with /// the generated code. This currently generates code using the udiv expansion, /// but future work includes generating more specialized code, e.g. when more -/// information about the operands are known. Currently only implements 32bit -/// scalar division (due to udiv's limitation), but future work is removing this -/// limitation. +/// information about the operands is known. Implements both 32-bit and 64-bit +/// scalar remainders. /// /// @brief Replace Rem with generated code. bool llvm::expandRemainder(BinaryOperator *Rem) { @@ -338,6 +380,15 @@ bool llvm::expandRemainder(BinaryOperator *Rem) { IRBuilder<> Builder(Rem); + Type *RemTy = Rem->getType(); + if (RemTy->isVectorTy()) + llvm_unreachable("Div over vectors not supported"); + + unsigned RemTyBitWidth = RemTy->getIntegerBitWidth(); + + if (RemTyBitWidth != 32 && RemTyBitWidth != 64) + llvm_unreachable("Div of bitwidth other than 32 or 64 not supported"); + // First prepare the sign if it's a signed remainder if (Rem->getOpcode() == Instruction::SRem) { Value *Remainder = generateSignedRemainderCode(Rem->getOperand(0), @@ -376,9 +427,8 @@ bool llvm::expandRemainder(BinaryOperator *Rem) { /// Generate code to divide two integers, replacing Div with the generated /// code. This currently generates code similarly to compiler-rt's /// implementations, but future work includes generating more specialized code -/// when more information about the operands are known. Currently only -/// implements 32bit scalar division, but future work is removing this -/// limitation. +/// when more information about the operands is known. Implements both +/// 32-bit and 64-bit scalar division. /// /// @brief Replace Div with generated code. bool llvm::expandDivision(BinaryOperator *Div) { @@ -388,9 +438,15 @@ bool llvm::expandDivision(BinaryOperator *Div) { IRBuilder<> Builder(Div); - if (Div->getType()->isVectorTy()) + Type *DivTy = Div->getType(); + if (DivTy->isVectorTy()) llvm_unreachable("Div over vectors not supported"); + unsigned DivTyBitWidth = DivTy->getIntegerBitWidth(); + + if (DivTyBitWidth != 32 && DivTyBitWidth != 64) + llvm_unreachable("Div of bitwidth other than 32 or 64 not supported"); + // First prepare the sign if it's a signed division if (Div->getOpcode() == Instruction::SDiv) { // Lower the code to unsigned division, and reset Div to point to the udiv. @@ -443,7 +499,7 @@ bool llvm::expandRemainderUpTo32Bits(BinaryOperator *Rem) { if (RemTyBitWidth == 32) return expandRemainder(Rem); - // If bitwidth smaller than 32 extend inputs, truncate output and proceed + // If bitwidth smaller than 32, extend the inputs, truncate the output, and proceed // with 32 bit division. IRBuilder<> Builder(Rem); @@ -471,6 +527,55 @@ bool llvm::expandRemainderUpTo32Bits(BinaryOperator *Rem) { return expandRemainder(cast<BinaryOperator>(ExtRem)); } +/// Generate code to compute the remainder of two integers of bitwidth up to +/// 64 bits. Uses the above routines and extends the inputs/truncates the +/// outputs to operate in 64 bits. +/// +/// @brief Replace Rem with emulation code.
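Before the implementation, the shape of a hypothetical call site for this new entry point (a sketch only; softenRem and its surrounding pass are illustrative, not from this patch):

#include "llvm/Transforms/Utils/IntegerDivision.h"

// Soft-expand an integer remainder of any width up to 64 bits. The helper
// widens narrower operands to 64 bits and then emits the expansion in
// place of Rem.
static bool softenRem(llvm::BinaryOperator *Rem) {
  return llvm::expandRemainderUpTo64Bits(Rem);
}

Targets without hardware division would call this from their lowering code, much as they would call expandRemainderUpTo32Bits today.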
+bool llvm::expandRemainderUpTo64Bits(BinaryOperator *Rem) { + assert((Rem->getOpcode() == Instruction::SRem || + Rem->getOpcode() == Instruction::URem) && + "Trying to expand remainder from a non-remainder function"); + + Type *RemTy = Rem->getType(); + if (RemTy->isVectorTy()) + llvm_unreachable("Div over vectors not supported"); + + unsigned RemTyBitWidth = RemTy->getIntegerBitWidth(); + + if (RemTyBitWidth > 64) + llvm_unreachable("Div of bitwidth greater than 64 not supported"); + + if (RemTyBitWidth == 64) + return expandRemainder(Rem); + + // If bitwidth smaller than 64, extend the inputs, truncate the output, and + // proceed with 64 bit division. + IRBuilder<> Builder(Rem); + + Value *ExtDividend; + Value *ExtDivisor; + Value *ExtRem; + Value *Trunc; + Type *Int64Ty = Builder.getInt64Ty(); + + if (Rem->getOpcode() == Instruction::SRem) { + ExtDividend = Builder.CreateSExt(Rem->getOperand(0), Int64Ty); + ExtDivisor = Builder.CreateSExt(Rem->getOperand(1), Int64Ty); + ExtRem = Builder.CreateSRem(ExtDividend, ExtDivisor); + } else { + ExtDividend = Builder.CreateZExt(Rem->getOperand(0), Int64Ty); + ExtDivisor = Builder.CreateZExt(Rem->getOperand(1), Int64Ty); + ExtRem = Builder.CreateURem(ExtDividend, ExtDivisor); + } + Trunc = Builder.CreateTrunc(ExtRem, RemTy); + + Rem->replaceAllUsesWith(Trunc); + Rem->dropAllReferences(); + Rem->eraseFromParent(); + + return expandRemainder(cast<BinaryOperator>(ExtRem)); +} /// Generate code to divide two integers of bitwidth up to 32 bits. Uses the /// above routines and extends the inputs/truncates the outputs to operate @@ -495,7 +600,7 @@ bool llvm::expandDivisionUpTo32Bits(BinaryOperator *Div) { if (DivTyBitWidth == 32) return expandDivision(Div); - // If bitwidth smaller than 32 extend inputs, truncate output and proceed + // If bitwidth smaller than 32, extend the inputs, truncate the output, and proceed // with 32 bit division. IRBuilder<> Builder(Div); @@ -522,3 +627,53 @@ bool llvm::expandDivisionUpTo32Bits(BinaryOperator *Div) { return expandDivision(cast<BinaryOperator>(ExtDiv)); } + +/// Generate code to divide two integers of bitwidth up to 64 bits. Uses the +/// above routines and extends the inputs/truncates the outputs to operate +/// in 64 bits. +/// +/// @brief Replace Div with emulation code. +bool llvm::expandDivisionUpTo64Bits(BinaryOperator *Div) { + assert((Div->getOpcode() == Instruction::SDiv || + Div->getOpcode() == Instruction::UDiv) && + "Trying to expand division from a non-division function"); + + Type *DivTy = Div->getType(); + if (DivTy->isVectorTy()) + llvm_unreachable("Div over vectors not supported"); + + unsigned DivTyBitWidth = DivTy->getIntegerBitWidth(); + + if (DivTyBitWidth > 64) + llvm_unreachable("Div of bitwidth greater than 64 not supported"); + + if (DivTyBitWidth == 64) + return expandDivision(Div); + + // If bitwidth smaller than 64, extend the inputs, truncate the output, and + // proceed with 64 bit division.
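The widening step described in the comment above, restated as plain scalar C++ (a standalone sketch assuming ordinary two's-complement conversions; this is not LLVM API code):

#include <cassert>
#include <cstdint>

// What expandDivisionUpTo64Bits arranges in IR for, e.g., an i16 sdiv:
// sign-extend both operands to 64 bits, divide there, truncate back.
static int16_t divideViaWidening(int16_t Dividend, int16_t Divisor) {
  int64_t ExtDividend = Dividend;            // CreateSExt
  int64_t ExtDivisor = Divisor;              // CreateSExt
  int64_t ExtDiv = ExtDividend / ExtDivisor; // the 64-bit expansion's result
  return int16_t(ExtDiv);                    // CreateTrunc
}

int main() {
  assert(divideViaWidening(-100, 7) == -14); // rounds towards zero
  return 0;
}

Sign-extension preserves the value for sdiv/srem, and zero-extension does the same for udiv/urem; in both cases the 64-bit result fits the narrow type again, so the final truncation is lossless (apart from the INT_MIN/-1 overflow case, which is undefined for the original sdiv as well).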
+ IRBuilder<> Builder(Div); + + Value *ExtDividend; + Value *ExtDivisor; + Value *ExtDiv; + Value *Trunc; + Type *Int64Ty = Builder.getInt64Ty(); + + if (Div->getOpcode() == Instruction::SDiv) { + ExtDividend = Builder.CreateSExt(Div->getOperand(0), Int64Ty); + ExtDivisor = Builder.CreateSExt(Div->getOperand(1), Int64Ty); + ExtDiv = Builder.CreateSDiv(ExtDividend, ExtDivisor); + } else { + ExtDividend = Builder.CreateZExt(Div->getOperand(0), Int64Ty); + ExtDivisor = Builder.CreateZExt(Div->getOperand(1), Int64Ty); + ExtDiv = Builder.CreateUDiv(ExtDividend, ExtDivisor); + } + Trunc = Builder.CreateTrunc(ExtDiv, DivTy); + + Div->replaceAllUsesWith(Trunc); + Div->dropAllReferences(); + Div->eraseFromParent(); + + return expandDivision(cast<BinaryOperator>(ExtDiv)); +} diff --git a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp index 97e7e5d..51a3d9c 100644 --- a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp @@ -27,222 +27,108 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "lcssa" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/PredIteratorCache.h" #include "llvm/Pass.h" -#include "llvm/Support/PredIteratorCache.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; -STATISTIC(NumLCSSA, "Number of live out of a loop variables"); - -namespace { - struct LCSSA : public LoopPass { - static char ID; // Pass identification, replacement for typeid - LCSSA() : LoopPass(ID) { - initializeLCSSAPass(*PassRegistry::getPassRegistry()); - } - - // Cached analysis information for the current function. - DominatorTree *DT; - LoopInfo *LI; - ScalarEvolution *SE; - PredIteratorCache PredCache; - Loop *L; - - virtual bool runOnLoop(Loop *L, LPPassManager &LPM); - - /// This transformation requires natural loop information & requires that - /// loop preheaders be inserted into the CFG. It maintains both of these, - /// as well as the CFG. It also requires dominator information. - /// - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); - - AU.addRequired<DominatorTree>(); - AU.addRequired<LoopInfo>(); - AU.addPreservedID(LoopSimplifyID); - AU.addPreserved<AliasAnalysis>(); - AU.addPreserved<ScalarEvolution>(); - } - private: - bool ProcessInstruction(Instruction *Inst, - const SmallVectorImpl<BasicBlock*> &ExitBlocks); - - /// verifyAnalysis() - Verify loop nest. - virtual void verifyAnalysis() const { - // Check the special guarantees that LCSSA makes. 
- assert(L->isLCSSAForm(*DT) && "LCSSA form not preserved!"); - } - }; -} - -char LCSSA::ID = 0; -INITIALIZE_PASS_BEGIN(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) -INITIALIZE_PASS_END(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false) - -Pass *llvm::createLCSSAPass() { return new LCSSA(); } -char &llvm::LCSSAID = LCSSA::ID; +#define DEBUG_TYPE "lcssa" +STATISTIC(NumLCSSA, "Number of live out of a loop variables"); -/// BlockDominatesAnExit - Return true if the specified block dominates at least -/// one of the blocks in the specified list. -static bool BlockDominatesAnExit(BasicBlock *BB, - const SmallVectorImpl<BasicBlock*> &ExitBlocks, - DominatorTree *DT) { - DomTreeNode *DomNode = DT->getNode(BB); +/// Return true if the specified block is in the list. +static bool isExitBlock(BasicBlock *BB, + const SmallVectorImpl<BasicBlock *> &ExitBlocks) { for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) - if (DT->dominates(DomNode, DT->getNode(ExitBlocks[i]))) + if (ExitBlocks[i] == BB) return true; - return false; } +/// Given an instruction in the loop, check to see if it has any uses that are +/// outside the current loop. If so, insert LCSSA PHI nodes and rewrite the +/// uses. +static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT, + const SmallVectorImpl<BasicBlock *> &ExitBlocks, + PredIteratorCache &PredCache) { + SmallVector<Use *, 16> UsesToRewrite; -/// runOnFunction - Process all loops in the function, inner-most out. -bool LCSSA::runOnLoop(Loop *TheLoop, LPPassManager &LPM) { - L = TheLoop; - - DT = &getAnalysis<DominatorTree>(); - LI = &getAnalysis<LoopInfo>(); - SE = getAnalysisIfAvailable<ScalarEvolution>(); - - // Get the set of exiting blocks. - SmallVector<BasicBlock*, 8> ExitBlocks; - L->getExitBlocks(ExitBlocks); - - if (ExitBlocks.empty()) - return false; - - // Look at all the instructions in the loop, checking to see if they have uses - // outside the loop. If so, rewrite those uses. - bool MadeChange = false; - - for (Loop::block_iterator BBI = L->block_begin(), E = L->block_end(); - BBI != E; ++BBI) { - BasicBlock *BB = *BBI; - - // For large loops, avoid use-scanning by using dominance information: In - // particular, if a block does not dominate any of the loop exits, then none - // of the values defined in the block could be used outside the loop. - if (!BlockDominatesAnExit(BB, ExitBlocks, DT)) - continue; - - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); - I != E; ++I) { - // Reject two common cases fast: instructions with no uses (like stores) - // and instructions with one use that is in the same block as this. - if (I->use_empty() || - (I->hasOneUse() && I->use_back()->getParent() == BB && - !isa<PHINode>(I->use_back()))) - continue; - - MadeChange |= ProcessInstruction(I, ExitBlocks); - } - } - - // If we modified the code, remove any caches about the loop from SCEV to - // avoid dangling entries. - // FIXME: This is a big hammer, can we clear the cache more selectively? - if (SE && MadeChange) - SE->forgetLoop(L); - - assert(L->isLCSSAForm(*DT)); - PredCache.clear(); - - return MadeChange; -} + BasicBlock *InstBB = Inst.getParent(); -/// isExitBlock - Return true if the specified block is in the list. 
-static bool isExitBlock(BasicBlock *BB, - const SmallVectorImpl<BasicBlock*> &ExitBlocks) { - for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) - if (ExitBlocks[i] == BB) - return true; - return false; -} + for (Use &U : Inst.uses()) { + Instruction *User = cast<Instruction>(U.getUser()); + BasicBlock *UserBB = User->getParent(); + if (PHINode *PN = dyn_cast<PHINode>(User)) + UserBB = PN->getIncomingBlock(U); -/// ProcessInstruction - Given an instruction in the loop, check to see if it -/// has any uses that are outside the current loop. If so, insert LCSSA PHI -/// nodes and rewrite the uses. -bool LCSSA::ProcessInstruction(Instruction *Inst, - const SmallVectorImpl<BasicBlock*> &ExitBlocks) { - SmallVector<Use*, 16> UsesToRewrite; - - BasicBlock *InstBB = Inst->getParent(); - - for (Value::use_iterator UI = Inst->use_begin(), E = Inst->use_end(); - UI != E; ++UI) { - User *U = *UI; - BasicBlock *UserBB = cast<Instruction>(U)->getParent(); - if (PHINode *PN = dyn_cast<PHINode>(U)) - UserBB = PN->getIncomingBlock(UI); - - if (InstBB != UserBB && !L->contains(UserBB)) - UsesToRewrite.push_back(&UI.getUse()); + if (InstBB != UserBB && !L.contains(UserBB)) + UsesToRewrite.push_back(&U); } // If there are no uses outside the loop, exit with no change. - if (UsesToRewrite.empty()) return false; - + if (UsesToRewrite.empty()) + return false; + ++NumLCSSA; // We are applying the transformation // Invoke instructions are special in that their result value is not available - // along their unwind edge. The code below tests to see whether DomBB dominates + // along their unwind edge. The code below tests to see whether DomBB + // dominates // the value, so adjust DomBB to the normal destination block, which is // effectively where the value is first usable. - BasicBlock *DomBB = Inst->getParent(); - if (InvokeInst *Inv = dyn_cast<InvokeInst>(Inst)) + BasicBlock *DomBB = Inst.getParent(); + if (InvokeInst *Inv = dyn_cast<InvokeInst>(&Inst)) DomBB = Inv->getNormalDest(); - DomTreeNode *DomNode = DT->getNode(DomBB); + DomTreeNode *DomNode = DT.getNode(DomBB); - SmallVector<PHINode*, 16> AddedPHIs; + SmallVector<PHINode *, 16> AddedPHIs; SSAUpdater SSAUpdate; - SSAUpdate.Initialize(Inst->getType(), Inst->getName()); - + SSAUpdate.Initialize(Inst.getType(), Inst.getName()); + // Insert the LCSSA phi's into all of the exit blocks dominated by the // value, and add them to the Phi's map. - for (SmallVectorImpl<BasicBlock*>::const_iterator BBI = ExitBlocks.begin(), - BBE = ExitBlocks.end(); BBI != BBE; ++BBI) { + for (SmallVectorImpl<BasicBlock *>::const_iterator BBI = ExitBlocks.begin(), + BBE = ExitBlocks.end(); + BBI != BBE; ++BBI) { BasicBlock *ExitBB = *BBI; - if (!DT->dominates(DomNode, DT->getNode(ExitBB))) continue; - + if (!DT.dominates(DomNode, DT.getNode(ExitBB))) + continue; + // If we already inserted something for this BB, don't reprocess it. - if (SSAUpdate.HasValueForBlock(ExitBB)) continue; - - PHINode *PN = PHINode::Create(Inst->getType(), - PredCache.GetNumPreds(ExitBB), - Inst->getName()+".lcssa", - ExitBB->begin()); + if (SSAUpdate.HasValueForBlock(ExitBB)) + continue; + + PHINode *PN = PHINode::Create(Inst.getType(), PredCache.GetNumPreds(ExitBB), + Inst.getName() + ".lcssa", ExitBB->begin()); // Add inputs from inside the loop for this PHI. 
for (BasicBlock **PI = PredCache.GetPreds(ExitBB); *PI; ++PI) { - PN->addIncoming(Inst, *PI); + PN->addIncoming(&Inst, *PI); // If the exit block has a predecessor not within the loop, arrange for // the incoming value use corresponding to that predecessor to be // rewritten in terms of a different LCSSA PHI. - if (!L->contains(*PI)) + if (!L.contains(*PI)) UsesToRewrite.push_back( - &PN->getOperandUse( - PN->getOperandNumForIncomingValue(PN->getNumIncomingValues()-1))); + &PN->getOperandUse(PN->getOperandNumForIncomingValue( + PN->getNumIncomingValues() - 1))); } AddedPHIs.push_back(PN); - + // Remember that this phi makes the value alive in this block. SSAUpdate.AddAvailableValue(ExitBB, PN); } @@ -259,15 +145,14 @@ bool LCSSA::ProcessInstruction(Instruction *Inst, if (PHINode *PN = dyn_cast<PHINode>(User)) UserBB = PN->getIncomingBlock(*UsesToRewrite[i]); - if (isa<PHINode>(UserBB->begin()) && - isExitBlock(UserBB, ExitBlocks)) { + if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) { // Tell the VHs that the uses changed. This updates SCEV's caches. if (UsesToRewrite[i]->get()->hasValueHandle()) ValueHandleBase::ValueIsRAUWd(*UsesToRewrite[i], UserBB->begin()); UsesToRewrite[i]->set(UserBB->begin()); continue; } - + // Otherwise, do full PHI insertion. SSAUpdate.RewriteUse(*UsesToRewrite[i]); } @@ -277,7 +162,152 @@ bool LCSSA::ProcessInstruction(Instruction *Inst, if (AddedPHIs[i]->use_empty()) AddedPHIs[i]->eraseFromParent(); } - + return true; } +/// Return true if the specified block dominates at least +/// one of the blocks in the specified list. +static bool +blockDominatesAnExit(BasicBlock *BB, + DominatorTree &DT, + const SmallVectorImpl<BasicBlock *> &ExitBlocks) { + DomTreeNode *DomNode = DT.getNode(BB); + for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) + if (DT.dominates(DomNode, DT.getNode(ExitBlocks[i]))) + return true; + + return false; +} + +bool llvm::formLCSSA(Loop &L, DominatorTree &DT, ScalarEvolution *SE) { + bool Changed = false; + + // Get the set of exiting blocks. + SmallVector<BasicBlock *, 8> ExitBlocks; + L.getExitBlocks(ExitBlocks); + + if (ExitBlocks.empty()) + return false; + + PredIteratorCache PredCache; + + // Look at all the instructions in the loop, checking to see if they have uses + // outside the loop. If so, rewrite those uses. + for (Loop::block_iterator BBI = L.block_begin(), BBE = L.block_end(); + BBI != BBE; ++BBI) { + BasicBlock *BB = *BBI; + + // For large loops, avoid use-scanning by using dominance information: In + // particular, if a block does not dominate any of the loop exits, then none + // of the values defined in the block could be used outside the loop. + if (!blockDominatesAnExit(BB, DT, ExitBlocks)) + continue; + + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + // Reject two common cases fast: instructions with no uses (like stores) + // and instructions with one use that is in the same block as this. + if (I->use_empty() || + (I->hasOneUse() && I->user_back()->getParent() == BB && + !isa<PHINode>(I->user_back()))) + continue; + + Changed |= processInstruction(L, *I, DT, ExitBlocks, PredCache); + } + } + + // If we modified the code, remove any caches about the loop from SCEV to + // avoid dangling entries. + // FIXME: This is a big hammer, can we clear the cache more selectively? + if (SE && Changed) + SE->forgetLoop(&L); + + assert(L.isLCSSAForm(DT)); + + return Changed; +} + +/// Process a loop nest depth first. 
+bool llvm::formLCSSARecursively(Loop &L, DominatorTree &DT, + ScalarEvolution *SE) { + bool Changed = false; + + // Recurse depth-first through inner loops. + for (Loop::iterator LI = L.begin(), LE = L.end(); LI != LE; ++LI) + Changed |= formLCSSARecursively(**LI, DT, SE); + + Changed |= formLCSSA(L, DT, SE); + return Changed; +} + +namespace { +struct LCSSA : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + LCSSA() : FunctionPass(ID) { + initializeLCSSAPass(*PassRegistry::getPassRegistry()); + } + + // Cached analysis information for the current function. + DominatorTree *DT; + LoopInfo *LI; + ScalarEvolution *SE; + + bool runOnFunction(Function &F) override; + + /// This transformation requires natural loop information & requires that + /// loop preheaders be inserted into the CFG. It maintains both of these, + /// as well as the CFG. It also requires dominator information. + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfo>(); + AU.addPreservedID(LoopSimplifyID); + AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<ScalarEvolution>(); + } + +private: + void verifyAnalysis() const override; +}; +} + +char LCSSA::ID = 0; +INITIALIZE_PASS_BEGIN(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_END(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false) + +Pass *llvm::createLCSSAPass() { return new LCSSA(); } +char &llvm::LCSSAID = LCSSA::ID; + + +/// Process all loops in the function, inner-most out. +bool LCSSA::runOnFunction(Function &F) { + bool Changed = false; + LI = &getAnalysis<LoopInfo>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + SE = getAnalysisIfAvailable<ScalarEvolution>(); + + // Simplify each loop nest in the function. + for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) + Changed |= formLCSSARecursively(**I, *DT, SE); + + return Changed; +} + +static void verifyLoop(Loop &L, DominatorTree &DT) { + // Recurse depth-first through inner loops. + for (Loop::iterator LI = L.begin(), LE = L.end(); LI != LE; ++LI) + verifyLoop(**LI, DT); + + // Check the special guarantees that LCSSA makes. + //assert(L.isLCSSAForm(DT) && "LCSSA form not preserved!"); +} + +void LCSSA::verifyAnalysis() const { + // Verify each loop nest in the function, assuming LI still points at that + // function's loop info. 
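With the rewrite above, LCSSA construction is an ordinary utility that other transforms can invoke directly instead of scheduling the pass. A hypothetical caller, sketched against the declarations that LoopUtils.h now carries (the enclosing transform is assumed to own the analyses):

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

// Re-establish LCSSA for a whole loop nest after a transform disturbed it.
// Inner loops are handled first, then Root itself; .lcssa PHIs land in each
// exit block that the defining block dominates.
static bool restoreLCSSA(llvm::Loop &Root, llvm::DominatorTree &DT,
                         llvm::ScalarEvolution *SE) {
  return llvm::formLCSSARecursively(Root, DT, SE);
}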
+ for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) + verifyLoop(**I, *DT); +} diff --git a/contrib/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm/lib/Transforms/Utils/Local.cpp index 2768041..a5e443f 100644 --- a/contrib/llvm/lib/Transforms/Utils/Local.cpp +++ b/contrib/llvm/lib/Transforms/Utils/Local.cpp @@ -17,15 +17,17 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/DIBuilder.h" -#include "llvm/DebugInfo.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DIBuilder.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" @@ -35,14 +37,14 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Operator.h" -#include "llvm/Support/CFG.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "local" + STATISTIC(NumRemoved, "Number of unreachable basic blocks removed"); //===----------------------------------------------------------------------===// @@ -127,8 +129,10 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, // dest. If so, eliminate it as an explicit compare. if (i.getCaseSuccessor() == DefaultDest) { MDNode* MD = SI->getMetadata(LLVMContext::MD_prof); - // MD should have 2 + NumCases operands. - if (MD && MD->getNumOperands() == 2 + SI->getNumCases()) { + unsigned NCases = SI->getNumCases(); + // Fold the case metadata into the default if there will be any branches + // left, unless the metadata doesn't match the switch. + if (NCases > 1 && MD && MD->getNumOperands() == 2 + NCases) { // Collect branch weights into a vector. SmallVector<uint32_t, 8> Weights; for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e; @@ -157,7 +161,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, // Otherwise, check to see if the switch only branches to one destination. // We do this by reseting "TheOnlyDest" to null when we find two non-equal // destinations. - if (i.getCaseSuccessor() != TheOnlyDest) TheOnlyDest = 0; + if (i.getCaseSuccessor() != TheOnlyDest) TheOnlyDest = nullptr; } if (CI && !TheOnlyDest) { @@ -178,7 +182,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, // Found case matching a constant operand? 
BasicBlock *Succ = SI->getSuccessor(i); if (Succ == TheOnlyDest) - TheOnlyDest = 0; // Don't modify the first branch to TheOnlyDest + TheOnlyDest = nullptr; // Don't modify the first branch to TheOnlyDest else Succ->removePredecessor(BB); } @@ -231,7 +235,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) { if (IBI->getDestination(i) == TheOnlyDest) - TheOnlyDest = 0; + TheOnlyDest = nullptr; else IBI->getDestination(i)->removePredecessor(IBI->getParent()); } @@ -329,7 +333,7 @@ llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V, // dead as we go. for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { Value *OpV = I->getOperand(i); - I->setOperand(i, 0); + I->setOperand(i, nullptr); if (!OpV->use_empty()) continue; @@ -352,8 +356,8 @@ llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V, /// true when there are no uses or multiple uses that all refer to the same /// value. static bool areAllUsesEqual(Instruction *I) { - Value::use_iterator UI = I->use_begin(); - Value::use_iterator UE = I->use_end(); + Value::user_iterator UI = I->user_begin(); + Value::user_iterator UE = I->user_end(); if (UI == UE) return true; @@ -374,7 +378,7 @@ bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN, const TargetLibraryInfo *TLI) { SmallPtrSet<Instruction*, 4> Visited; for (Instruction *I = PN; areAllUsesEqual(I) && !I->mayHaveSideEffects(); - I = cast<Instruction>(*I->use_begin())) { + I = cast<Instruction>(*I->user_begin())) { if (I->use_empty()) return RecursivelyDeleteTriviallyDeadInstructions(I, TLI); @@ -505,12 +509,18 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, Pass *P) { PredBB->getTerminator()->eraseFromParent(); DestBB->getInstList().splice(DestBB->begin(), PredBB->getInstList()); + // If the PredBB is the entry block of the function, move DestBB up to + // become the entry block after we erase PredBB. + if (PredBB == &DestBB->getParent()->getEntryBlock()) + DestBB->moveAfter(PredBB); + if (P) { - DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>(); - if (DT) { - BasicBlock *PredBBIDom = DT->getNode(PredBB)->getIDom()->getBlock(); - DT->changeImmediateDominator(DestBB, PredBBIDom); - DT->eraseNode(PredBB); + if (DominatorTreeWrapperPass *DTWP = + P->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { + DominatorTree &DT = DTWP->getDomTree(); + BasicBlock *PredBBIDom = DT.getNode(PredBB)->getIDom()->getBlock(); + DT.changeImmediateDominator(DestBB, PredBBIDom); + DT.eraseNode(PredBB); } } // Nuke BB. @@ -749,10 +759,9 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) { if (!Succ->getSinglePredecessor()) { BasicBlock::iterator BBI = BB->begin(); while (isa<PHINode>(*BBI)) { - for (Value::use_iterator UI = BBI->use_begin(), E = BBI->use_end(); - UI != E; ++UI) { - if (PHINode* PN = dyn_cast<PHINode>(*UI)) { - if (PN->getIncomingBlock(UI) != BB) + for (Use &U : BBI->uses()) { + if (PHINode* PN = dyn_cast<PHINode>(U.getUser())) { + if (PN->getIncomingBlock(U) != BB) return false; } else { return false; @@ -892,24 +901,26 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Align, return PrefAlign; } - if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) { + if (auto *GO = dyn_cast<GlobalObject>(V)) { // If there is a large requested alignment and we can, bump up the alignment // of the global. 
- if (GV->isDeclaration()) return Align; + if (GO->isDeclaration()) + return Align; // If the memory we set aside for the global may not be the memory used by // the final program then it is impossible for us to reliably enforce the // preferred alignment. - if (GV->isWeakForLinker()) return Align; + if (GO->isWeakForLinker()) + return Align; - if (GV->getAlignment() >= PrefAlign) - return GV->getAlignment(); + if (GO->getAlignment() >= PrefAlign) + return GO->getAlignment(); // We can only increase the alignment of the global if it has no alignment // specified or if it is not assigned a section. If it is assigned a // section, the global could be densely packed with other objects in the // section, increasing the alignment could cause padding issues. - if (!GV->hasSection() || GV->getAlignment() == 0) - GV->setAlignment(PrefAlign); - return GV->getAlignment(); + if (!GO->hasSection() || GO->getAlignment() == 0) + GO->setAlignment(PrefAlign); + return GO->getAlignment(); } return Align; @@ -926,7 +937,7 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, unsigned BitWidth = DL ? DL->getPointerTypeSizeInBits(V->getType()) : 64; APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - ComputeMaskedBits(V, KnownZero, KnownOne, DL); + computeKnownBits(V, KnownZero, KnownOne, DL); unsigned TrailZ = KnownZero.countTrailingOnes(); // Avoid trouble with ridiculously large TrailZ values, such as @@ -979,10 +990,10 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, if (LdStHasDebugValue(DIVar, SI)) return true; - Instruction *DbgVal = NULL; + Instruction *DbgVal = nullptr; // If an argument is zero extended then use argument directly. The ZExt // may be zapped by an optimization pass in future. - Argument *ExtendedArg = NULL; + Argument *ExtendedArg = nullptr; if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0))) ExtendedArg = dyn_cast<Argument>(ZExt->getOperand(0)); if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) @@ -991,14 +1002,7 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, DbgVal = Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, SI); else DbgVal = Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar, SI); - - // Propagate any debug metadata from the store onto the dbg.value. - DebugLoc SIDL = SI->getDebugLoc(); - if (!SIDL.isUnknown()) - DbgVal->setDebugLoc(SIDL); - // Otherwise propagate debug metadata from dbg.declare. - else - DbgVal->setDebugLoc(DDI->getDebugLoc()); + DbgVal->setDebugLoc(DDI->getDebugLoc()); return true; } @@ -1018,51 +1022,54 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, Instruction *DbgVal = Builder.insertDbgValueIntrinsic(LI->getOperand(0), 0, DIVar, LI); - - // Propagate any debug metadata from the store onto the dbg.value. - DebugLoc LIDL = LI->getDebugLoc(); - if (!LIDL.isUnknown()) - DbgVal->setDebugLoc(LIDL); - // Otherwise propagate debug metadata from dbg.declare. - else - DbgVal->setDebugLoc(DDI->getDebugLoc()); + DbgVal->setDebugLoc(DDI->getDebugLoc()); return true; } +/// Determine whether this alloca is either a VLA or an array. +static bool isArray(AllocaInst *AI) { + return AI->isArrayAllocation() || + AI->getType()->getElementType()->isArrayTy(); +} + /// LowerDbgDeclare - Lowers llvm.dbg.declare intrinsics into appropriate set /// of llvm.dbg.value intrinsics. 
bool llvm::LowerDbgDeclare(Function &F) { DIBuilder DIB(*F.getParent()); SmallVector<DbgDeclareInst *, 4> Dbgs; - for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) - for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ++BI) { - if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(BI)) + for (auto &FI : F) + for (BasicBlock::iterator BI : FI) + if (auto DDI = dyn_cast<DbgDeclareInst>(BI)) Dbgs.push_back(DDI); - } + if (Dbgs.empty()) return false; - for (SmallVectorImpl<DbgDeclareInst *>::iterator I = Dbgs.begin(), - E = Dbgs.end(); I != E; ++I) { - DbgDeclareInst *DDI = *I; + for (auto &I : Dbgs) { + DbgDeclareInst *DDI = I; AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress()); // If this is an alloca for a scalar variable, insert a dbg.value // at each load and store to the alloca and erase the dbg.declare. - if (AI && !AI->isArrayAllocation()) { - - // We only remove the dbg.declare intrinsic if all uses are - // converted to dbg.value intrinsics. - bool RemoveDDI = true; - for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); - UI != E; ++UI) - if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) + // The dbg.values allow tracking a variable even if it is not + // stored on the stack, while the dbg.declare can only describe + // the stack slot (and at a lexical-scope granularity). Later + // passes will attempt to elide the stack slot. + if (AI && !isArray(AI)) { + for (User *U : AI->users()) + if (StoreInst *SI = dyn_cast<StoreInst>(U)) ConvertDebugDeclareToDebugValue(DDI, SI, DIB); - else if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) + else if (LoadInst *LI = dyn_cast<LoadInst>(U)) ConvertDebugDeclareToDebugValue(DDI, LI, DIB); - else - RemoveDDI = false; - if (RemoveDDI) - DDI->eraseFromParent(); + else if (CallInst *CI = dyn_cast<CallInst>(U)) { + // This is a call by-value or some other instruction that + // takes a pointer to the variable. Insert a *value* + // intrinsic that describes the alloca. + auto DbgVal = + DIB.insertDbgValueIntrinsic(AI, 0, + DIVariable(DDI->getVariable()), CI); + DbgVal->setDebugLoc(DDI->getDebugLoc()); + } + DDI->eraseFromParent(); } } return true; @@ -1072,12 +1079,11 @@ bool llvm::LowerDbgDeclare(Function &F) { /// alloca 'V', if any. 
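Condensed to its core, the per-alloca rewrite LowerDbgDeclare performs above can be phrased as a small helper. This is a sketch using only calls that appear in this file; the array/VLA filter and the by-value-call case are omitted, and lowerOneDeclare itself is a hypothetical name:

#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;

// Rewrite one dbg.declare of a scalar alloca into dbg.values at each access.
static void lowerOneDeclare(DbgDeclareInst *DDI, AllocaInst *AI,
                            DIBuilder &DIB) {
  for (User *U : AI->users())
    if (auto *SI = dyn_cast<StoreInst>(U))
      ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
    else if (auto *LI = dyn_cast<LoadInst>(U))
      ConvertDebugDeclareToDebugValue(DDI, LI, DIB);
  DDI->eraseFromParent(); // the declare is now fully superseded
}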
DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) { if (MDNode *DebugNode = MDNode::getIfExists(V->getContext(), V)) - for (Value::use_iterator UI = DebugNode->use_begin(), - E = DebugNode->use_end(); UI != E; ++UI) - if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI)) + for (User *U : DebugNode->users()) + if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) return DDI; - return 0; + return nullptr; } bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, diff --git a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp index 6d5f16c..ef42291 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -37,331 +37,72 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-simplify" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/DependenceAnalysis.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Type.h" -#include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; +#define DEBUG_TYPE "loop-simplify" + STATISTIC(NumInserted, "Number of pre-header or exit blocks inserted"); STATISTIC(NumNested , "Number of nested loops split out"); -namespace { - struct LoopSimplify : public LoopPass { - static char ID; // Pass identification, replacement for typeid - LoopSimplify() : LoopPass(ID) { - initializeLoopSimplifyPass(*PassRegistry::getPassRegistry()); - } - - // AA - If we have an alias analysis object to update, this is it, otherwise - // this is null. - AliasAnalysis *AA; - LoopInfo *LI; - DominatorTree *DT; - ScalarEvolution *SE; - Loop *L; - virtual bool runOnLoop(Loop *L, LPPassManager &LPM); - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - // We need loop information to identify the loops... - AU.addRequired<DominatorTree>(); - AU.addPreserved<DominatorTree>(); - - AU.addRequired<LoopInfo>(); - AU.addPreserved<LoopInfo>(); - - AU.addPreserved<AliasAnalysis>(); - AU.addPreserved<ScalarEvolution>(); - AU.addPreserved<DependenceAnalysis>(); - AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added. - } - - /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees. 
- void verifyAnalysis() const; - - private: - bool ProcessLoop(Loop *L, LPPassManager &LPM); - BasicBlock *RewriteLoopExitBlock(Loop *L, BasicBlock *Exit); - Loop *SeparateNestedLoop(Loop *L, LPPassManager &LPM, - BasicBlock *Preheader); - BasicBlock *InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader); - }; -} - -static void PlaceSplitBlockCarefully(BasicBlock *NewBB, - SmallVectorImpl<BasicBlock*> &SplitPreds, - Loop *L); - -char LoopSimplify::ID = 0; -INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify", - "Canonicalize natural loops", true, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) -INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", - "Canonicalize natural loops", true, false) - -// Publicly exposed interface to pass... -char &llvm::LoopSimplifyID = LoopSimplify::ID; -Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); } - -/// runOnLoop - Run down all loops in the CFG (recursively, but we could do -/// it in any convenient order) inserting preheaders... -/// -bool LoopSimplify::runOnLoop(Loop *l, LPPassManager &LPM) { - L = l; - bool Changed = false; - LI = &getAnalysis<LoopInfo>(); - AA = getAnalysisIfAvailable<AliasAnalysis>(); - DT = &getAnalysis<DominatorTree>(); - SE = getAnalysisIfAvailable<ScalarEvolution>(); - - Changed |= ProcessLoop(L, LPM); - - return Changed; -} - -/// ProcessLoop - Walk the loop structure in depth first order, ensuring that -/// all loops have preheaders. -/// -bool LoopSimplify::ProcessLoop(Loop *L, LPPassManager &LPM) { - bool Changed = false; -ReprocessLoop: - - // Check to see that no blocks (other than the header) in this loop have - // predecessors that are not in the loop. This is not valid for natural - // loops, but can occur if the blocks are unreachable. Since they are - // unreachable we can just shamelessly delete those CFG edges! - for (Loop::block_iterator BB = L->block_begin(), E = L->block_end(); - BB != E; ++BB) { - if (*BB == L->getHeader()) continue; - - SmallPtrSet<BasicBlock*, 4> BadPreds; - for (pred_iterator PI = pred_begin(*BB), - PE = pred_end(*BB); PI != PE; ++PI) { - BasicBlock *P = *PI; - if (!L->contains(P)) - BadPreds.insert(P); - } - - // Delete each unique out-of-loop (and thus dead) predecessor. - for (SmallPtrSet<BasicBlock*, 4>::iterator I = BadPreds.begin(), - E = BadPreds.end(); I != E; ++I) { - - DEBUG(dbgs() << "LoopSimplify: Deleting edge from dead predecessor " - << (*I)->getName() << "\n"); - - // Inform each successor of each dead pred. - for (succ_iterator SI = succ_begin(*I), SE = succ_end(*I); SI != SE; ++SI) - (*SI)->removePredecessor(*I); - // Zap the dead pred's terminator and replace it with unreachable. - TerminatorInst *TI = (*I)->getTerminator(); - TI->replaceAllUsesWith(UndefValue::get(TI->getType())); - (*I)->getTerminator()->eraseFromParent(); - new UnreachableInst((*I)->getContext(), *I); - Changed = true; - } - } - - // If there are exiting blocks with branches on undef, resolve the undef in - // the direction which will exit the loop. This will help simplify loop - // trip count computations. 
- SmallVector<BasicBlock*, 8> ExitingBlocks; - L->getExitingBlocks(ExitingBlocks); - for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(), - E = ExitingBlocks.end(); I != E; ++I) - if (BranchInst *BI = dyn_cast<BranchInst>((*I)->getTerminator())) - if (BI->isConditional()) { - if (UndefValue *Cond = dyn_cast<UndefValue>(BI->getCondition())) { - - DEBUG(dbgs() << "LoopSimplify: Resolving \"br i1 undef\" to exit in " - << (*I)->getName() << "\n"); - - BI->setCondition(ConstantInt::get(Cond->getType(), - !L->contains(BI->getSuccessor(0)))); - - // This may make the loop analyzable, force SCEV recomputation. - if (SE) - SE->forgetLoop(L); - - Changed = true; - } - } - - // Does the loop already have a preheader? If so, don't insert one. - BasicBlock *Preheader = L->getLoopPreheader(); - if (!Preheader) { - Preheader = InsertPreheaderForLoop(L, this); - if (Preheader) { - ++NumInserted; - Changed = true; - } - } - - // Next, check to make sure that all exit nodes of the loop only have - // predecessors that are inside of the loop. This check guarantees that the - // loop preheader/header will dominate the exit blocks. If the exit block has - // predecessors from outside of the loop, split the edge now. - SmallVector<BasicBlock*, 8> ExitBlocks; - L->getExitBlocks(ExitBlocks); - - SmallSetVector<BasicBlock *, 8> ExitBlockSet(ExitBlocks.begin(), - ExitBlocks.end()); - for (SmallSetVector<BasicBlock *, 8>::iterator I = ExitBlockSet.begin(), - E = ExitBlockSet.end(); I != E; ++I) { - BasicBlock *ExitBlock = *I; - for (pred_iterator PI = pred_begin(ExitBlock), PE = pred_end(ExitBlock); - PI != PE; ++PI) - // Must be exactly this loop: no subloops, parent loops, or non-loop preds - // allowed. - if (!L->contains(*PI)) { - if (RewriteLoopExitBlock(L, ExitBlock)) { - ++NumInserted; - Changed = true; - } - break; - } - } - - // If the header has more than two predecessors at this point (from the - // preheader and from multiple backedges), we must adjust the loop. - BasicBlock *LoopLatch = L->getLoopLatch(); - if (!LoopLatch) { - // If this is really a nested loop, rip it out into a child loop. Don't do - // this for loops with a giant number of backedges, just factor them into a - // common backedge instead. - if (L->getNumBackEdges() < 8) { - if (SeparateNestedLoop(L, LPM, Preheader)) { - ++NumNested; - // This is a big restructuring change, reprocess the whole loop. - Changed = true; - // GCC doesn't tail recursion eliminate this. - goto ReprocessLoop; - } - } - - // If we either couldn't, or didn't want to, identify nesting of the loops, - // insert a new block that all backedges target, then make it jump to the - // loop header. - LoopLatch = InsertUniqueBackedgeBlock(L, Preheader); - if (LoopLatch) { - ++NumInserted; - Changed = true; - } +// If the block isn't already, move the new block to right after some 'outside +// block' block. This prevents the preheader from being placed inside the loop +// body, e.g. when the loop hasn't been rotated. +static void placeSplitBlockCarefully(BasicBlock *NewBB, + SmallVectorImpl<BasicBlock *> &SplitPreds, + Loop *L) { + // Check to see if NewBB is already well placed. + Function::iterator BBI = NewBB; --BBI; + for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) { + if (&*BBI == SplitPreds[i]) + return; } - // Scan over the PHI nodes in the loop header. Since they now have only two - // incoming values (the loop is canonicalized), we may have simplified the PHI - // down to 'X = phi [X, Y]', which should be replaced with 'Y'. 
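The 'X = phi [X, Y]' fold mentioned at the end of the chunk above has a simple scalar intuition. A toy model in plain C++ (nothing LLVM-specific; the loop stands in for repeated trips around the backedge):

#include <cassert>

// A header PHI whose only other input is itself can never observe anything
// but Y, no matter how many times the backedge is taken.
static int phiFixpoint(int Y, int Trips) {
  int X = Y;        // first entry to the header comes from the preheader
  for (int i = 0; i < Trips; ++i)
    X = X;          // the self-incoming value: X = phi [X, Y]
  return X;         // always Y, so the PHI is replaced with Y
}

int main() {
  assert(phiFixpoint(42, 10) == 42);
  return 0;
}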
- PHINode *PN; - for (BasicBlock::iterator I = L->getHeader()->begin(); - (PN = dyn_cast<PHINode>(I++)); ) - if (Value *V = SimplifyInstruction(PN, 0, 0, DT)) { - if (AA) AA->deleteValue(PN); - if (SE) SE->forgetValue(PN); - PN->replaceAllUsesWith(V); - PN->eraseFromParent(); - } - - // If this loop has multiple exits and the exits all go to the same - // block, attempt to merge the exits. This helps several passes, such - // as LoopRotation, which do not support loops with multiple exits. - // SimplifyCFG also does this (and this code uses the same utility - // function), however this code is loop-aware, where SimplifyCFG is - // not. That gives it the advantage of being able to hoist - // loop-invariant instructions out of the way to open up more - // opportunities, and the disadvantage of having the responsibility - // to preserve dominator information. - bool UniqueExit = true; - if (!ExitBlocks.empty()) - for (unsigned i = 1, e = ExitBlocks.size(); i != e; ++i) - if (ExitBlocks[i] != ExitBlocks[0]) { - UniqueExit = false; - break; - } - if (UniqueExit) { - for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) { - BasicBlock *ExitingBlock = ExitingBlocks[i]; - if (!ExitingBlock->getSinglePredecessor()) continue; - BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator()); - if (!BI || !BI->isConditional()) continue; - CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition()); - if (!CI || CI->getParent() != ExitingBlock) continue; - - // Attempt to hoist out all instructions except for the - // comparison and the branch. - bool AllInvariant = true; - for (BasicBlock::iterator I = ExitingBlock->begin(); &*I != BI; ) { - Instruction *Inst = I++; - // Skip debug info intrinsics. - if (isa<DbgInfoIntrinsic>(Inst)) - continue; - if (Inst == CI) - continue; - if (!L->makeLoopInvariant(Inst, Changed, - Preheader ? Preheader->getTerminator() : 0)) { - AllInvariant = false; - break; - } - } - if (!AllInvariant) continue; - - // The block has now been cleared of all instructions except for - // a comparison and a conditional branch. SimplifyCFG may be able - // to fold it now. - if (!FoldBranchToCommonDest(BI)) continue; - - // Success. The block is now dead, so remove it from the loop, - // update the dominator tree and delete it. - DEBUG(dbgs() << "LoopSimplify: Eliminating exiting block " - << ExitingBlock->getName() << "\n"); - - // If any reachable control flow within this loop has changed, notify - // ScalarEvolution. Currently assume the parent loop doesn't change - // (spliting edges doesn't count). If blocks, CFG edges, or other values - // in the parent loop change, then we need call to forgetLoop() for the - // parent instead. - if (SE) - SE->forgetLoop(L); - - assert(pred_begin(ExitingBlock) == pred_end(ExitingBlock)); - Changed = true; - LI->removeBlock(ExitingBlock); - - DomTreeNode *Node = DT->getNode(ExitingBlock); - const std::vector<DomTreeNodeBase<BasicBlock> *> &Children = - Node->getChildren(); - while (!Children.empty()) { - DomTreeNode *Child = Children.front(); - DT->changeImmediateDominator(Child, Node->getIDom()); - } - DT->eraseNode(ExitingBlock); + // If it isn't already after an outside block, move it after one. This is + // always good as it makes the uncond branch from the outside block into a + // fall-through. - BI->getSuccessor(0)->removePredecessor(ExitingBlock); - BI->getSuccessor(1)->removePredecessor(ExitingBlock); - ExitingBlock->eraseFromParent(); + // Figure out *which* outside block to put this after. 
Prefer an outside + // block that neighbors a BB actually in the loop. + BasicBlock *FoundBB = nullptr; + for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) { + Function::iterator BBI = SplitPreds[i]; + if (++BBI != NewBB->getParent()->end() && + L->contains(BBI)) { + FoundBB = SplitPreds[i]; + break; } } - return Changed; + // If our heuristic for a *good* bb to place this after doesn't find + // anything, just pick something. It's likely better than leaving it within + // the loop. + if (!FoundBB) + FoundBB = SplitPreds[0]; + NewBB->moveAfter(FoundBB); } /// InsertPreheaderForLoop - Once we discover that a loop doesn't have a @@ -380,7 +121,7 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) { // If the loop is branched to from an indirect branch, we won't // be able to fully transform the loop, because it prohibits // edge splitting. - if (isa<IndirectBrInst>(P->getTerminator())) return 0; + if (isa<IndirectBrInst>(P->getTerminator())) return nullptr; // Keep track of it. OutsideBlocks.push_back(P); @@ -406,38 +147,39 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) { // Make sure that NewBB is put someplace intelligent, which doesn't mess up // code layout too horribly. - PlaceSplitBlockCarefully(PreheaderBB, OutsideBlocks, L); + placeSplitBlockCarefully(PreheaderBB, OutsideBlocks, L); return PreheaderBB; } -/// RewriteLoopExitBlock - Ensure that the loop preheader dominates all exit -/// blocks. This method is used to split exit blocks that have predecessors -/// outside of the loop. -BasicBlock *LoopSimplify::RewriteLoopExitBlock(Loop *L, BasicBlock *Exit) { +/// \brief Ensure that the loop preheader dominates all exit blocks. +/// +/// This method is used to split exit blocks that have predecessors outside of +/// the loop. +static BasicBlock *rewriteLoopExitBlock(Loop *L, BasicBlock *Exit, Pass *PP) { SmallVector<BasicBlock*, 8> LoopBlocks; for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) { BasicBlock *P = *I; if (L->contains(P)) { // Don't do this if the loop is exited via an indirect branch. - if (isa<IndirectBrInst>(P->getTerminator())) return 0; + if (isa<IndirectBrInst>(P->getTerminator())) return nullptr; LoopBlocks.push_back(P); } } assert(!LoopBlocks.empty() && "No edges coming in from outside the loop?"); - BasicBlock *NewExitBB = 0; + BasicBlock *NewExitBB = nullptr; if (Exit->isLandingPad()) { SmallVector<BasicBlock*, 2> NewBBs; SplitLandingPadPredecessors(Exit, ArrayRef<BasicBlock*>(&LoopBlocks[0], LoopBlocks.size()), ".loopexit", ".nonloopexit", - this, NewBBs); + PP, NewBBs); NewExitBB = NewBBs[0]; } else { - NewExitBB = SplitBlockPredecessors(Exit, LoopBlocks, ".loopexit", this); + NewExitBB = SplitBlockPredecessors(Exit, LoopBlocks, ".loopexit", PP); } DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block " @@ -445,33 +187,33 @@ BasicBlock *LoopSimplify::RewriteLoopExitBlock(Loop *L, BasicBlock *Exit) { return NewExitBB; } -/// AddBlockAndPredsToSet - Add the specified block, and all of its -/// predecessors, to the specified set, if it's not already in there. Stop -/// predecessor traversal when we reach StopBlock. -static void AddBlockAndPredsToSet(BasicBlock *InputBB, BasicBlock *StopBlock, +/// Add the specified block, and all of its predecessors, to the specified set, +/// if it's not already in there. Stop predecessor traversal when we reach +/// StopBlock. 
+static void addBlockAndPredsToSet(BasicBlock *InputBB, BasicBlock *StopBlock, std::set<BasicBlock*> &Blocks) { - std::vector<BasicBlock *> WorkList; - WorkList.push_back(InputBB); + SmallVector<BasicBlock *, 8> Worklist; + Worklist.push_back(InputBB); do { - BasicBlock *BB = WorkList.back(); WorkList.pop_back(); + BasicBlock *BB = Worklist.pop_back_val(); if (Blocks.insert(BB).second && BB != StopBlock) // If BB is not already processed and it is not a stop block then // insert its predecessor in the work list for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) { BasicBlock *WBB = *I; - WorkList.push_back(WBB); + Worklist.push_back(WBB); } - } while(!WorkList.empty()); + } while (!Worklist.empty()); } -/// FindPHIToPartitionLoops - The first part of loop-nestification is to find a -/// PHI node that tells us how to partition the loops. -static PHINode *FindPHIToPartitionLoops(Loop *L, DominatorTree *DT, - AliasAnalysis *AA, LoopInfo *LI) { +/// \brief The first part of loop-nestification is to find a PHI node that tells +/// us how to partition the loops. +static PHINode *findPHIToPartitionLoops(Loop *L, AliasAnalysis *AA, + DominatorTree *DT) { for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ) { PHINode *PN = cast<PHINode>(I); ++I; - if (Value *V = SimplifyInstruction(PN, 0, 0, DT)) { + if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, DT)) { // This is a degenerate PHI already, don't modify it! PN->replaceAllUsesWith(V); if (AA) AA->deleteValue(PN); @@ -486,49 +228,13 @@ static PHINode *FindPHIToPartitionLoops(Loop *L, DominatorTree *DT, // We found something tasty to remove. return PN; } - return 0; + return nullptr; } -// PlaceSplitBlockCarefully - If the block isn't already, move the new block to -// right after some 'outside block' block. This prevents the preheader from -// being placed inside the loop body, e.g. when the loop hasn't been rotated. -void PlaceSplitBlockCarefully(BasicBlock *NewBB, - SmallVectorImpl<BasicBlock*> &SplitPreds, - Loop *L) { - // Check to see if NewBB is already well placed. - Function::iterator BBI = NewBB; --BBI; - for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) { - if (&*BBI == SplitPreds[i]) - return; - } - - // If it isn't already after an outside block, move it after one. This is - // always good as it makes the uncond branch from the outside block into a - // fall-through. - - // Figure out *which* outside block to put this after. Prefer an outside - // block that neighbors a BB actually in the loop. - BasicBlock *FoundBB = 0; - for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) { - Function::iterator BBI = SplitPreds[i]; - if (++BBI != NewBB->getParent()->end() && - L->contains(BBI)) { - FoundBB = SplitPreds[i]; - break; - } - } - - // If our heuristic for a *good* bb to place this after doesn't find - // anything, just pick something. It's likely better than leaving it within - // the loop. - if (!FoundBB) - FoundBB = SplitPreds[0]; - NewBB->moveAfter(FoundBB); -} - - -/// SeparateNestedLoop - If this loop has multiple backedges, try to pull one of -/// them out into a nested loop. This is important for code that looks like +/// \brief If this loop has multiple backedges, try to pull one of them out into +/// a nested loop. +/// +/// This is important for code that looks like /// this: /// /// Loop: @@ -544,18 +250,19 @@ void PlaceSplitBlockCarefully(BasicBlock *NewBB, /// If we are able to separate out a loop, return the new outer loop that was /// created. 
/// -Loop *LoopSimplify::SeparateNestedLoop(Loop *L, LPPassManager &LPM, - BasicBlock *Preheader) { +static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, + AliasAnalysis *AA, DominatorTree *DT, + LoopInfo *LI, ScalarEvolution *SE, Pass *PP) { // Don't try to separate loops without a preheader. if (!Preheader) - return 0; + return nullptr; // The header is not a landing pad; preheader insertion should ensure this. assert(!L->getHeader()->isLandingPad() && "Can't insert backedge to landing pad"); - PHINode *PN = FindPHIToPartitionLoops(L, DT, AA, LI); - if (PN == 0) return 0; // No known way to partition. + PHINode *PN = findPHIToPartitionLoops(L, AA, DT); + if (!PN) return nullptr; // No known way to partition. // Pull out all predecessors that have varying values in the loop. This // handles the case when a PHI node has multiple instances of itself as @@ -566,7 +273,7 @@ Loop *LoopSimplify::SeparateNestedLoop(Loop *L, LPPassManager &LPM, !L->contains(PN->getIncomingBlock(i))) { // We can't split indirectbr edges. if (isa<IndirectBrInst>(PN->getIncomingBlock(i)->getTerminator())) - return 0; + return nullptr; OuterLoopPreds.push_back(PN->getIncomingBlock(i)); } } @@ -580,11 +287,11 @@ Loop *LoopSimplify::SeparateNestedLoop(Loop *L, LPPassManager &LPM, BasicBlock *Header = L->getHeader(); BasicBlock *NewBB = - SplitBlockPredecessors(Header, OuterLoopPreds, ".outer", this); + SplitBlockPredecessors(Header, OuterLoopPreds, ".outer", PP); // Make sure that NewBB is put someplace intelligent, which doesn't mess up // code layout too horribly. - PlaceSplitBlockCarefully(NewBB, OuterLoopPreds, L); + placeSplitBlockCarefully(NewBB, OuterLoopPreds, L); // Create the new outer loop. Loop *NewOuter = new Loop(); @@ -598,9 +305,6 @@ Loop *LoopSimplify::SeparateNestedLoop(Loop *L, LPPassManager &LPM, // L is now a subloop of our outer loop. NewOuter->addChildLoop(L); - // Add the new loop to the pass manager queue. - LPM.insertLoopIntoQueue(NewOuter); - for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) NewOuter->addBlockEntry(*I); @@ -615,7 +319,7 @@ Loop *LoopSimplify::SeparateNestedLoop(Loop *L, LPPassManager &LPM, for (pred_iterator PI=pred_begin(Header), E = pred_end(Header); PI!=E; ++PI) { BasicBlock *P = *PI; if (DT->dominates(Header, P)) - AddBlockAndPredsToSet(P, Header, BlocksInL); + addBlockAndPredsToSet(P, Header, BlocksInL); } // Scan all of the loop children of L, moving them to OuterLoop if they are @@ -643,15 +347,15 @@ Loop *LoopSimplify::SeparateNestedLoop(Loop *L, LPPassManager &LPM, return NewOuter; } - - -/// InsertUniqueBackedgeBlock - This method is called when the specified loop -/// has more than one backedge in it. If this occurs, revector all of these -/// backedges to target a new basic block and have that block branch to the loop -/// header. This ensures that loops have exactly one backedge. +/// \brief This method is called when the specified loop has more than one +/// backedge in it. /// -BasicBlock * -LoopSimplify::InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader) { +/// If this occurs, revector all of these backedges to target a new basic block +/// and have that block branch to the loop header. This ensures that loops +/// have exactly one backedge. 
+static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, + AliasAnalysis *AA, + DominatorTree *DT, LoopInfo *LI) { assert(L->getNumBackEdges() > 1 && "Must have > 1 backedge!"); // Get information about the loop @@ -660,7 +364,7 @@ LoopSimplify::InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader) { // Unique backedge insertion currently depends on having a preheader. if (!Preheader) - return 0; + return nullptr; // The header is not a landing pad; preheader insertion should ensure this. assert(!Header->isLandingPad() && "Can't insert backedge to landing pad"); @@ -672,7 +376,7 @@ LoopSimplify::InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader) { // Indirectbr edges cannot be split, so we must fail if we find one. if (isa<IndirectBrInst>(P->getTerminator())) - return 0; + return nullptr; if (P != Preheader) BackedgeBlocks.push_back(P); } @@ -701,7 +405,7 @@ LoopSimplify::InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader) { // preheader over to the new PHI node. unsigned PreheaderIdx = ~0U; bool HasUniqueIncomingValue = true; - Value *UniqueValue = 0; + Value *UniqueValue = nullptr; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { BasicBlock *IBB = PN->getIncomingBlock(i); Value *IV = PN->getIncomingValue(i); @@ -710,7 +414,7 @@ LoopSimplify::InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader) { } else { NewPN->addIncoming(IV, IBB); if (HasUniqueIncomingValue) { - if (UniqueValue == 0) + if (!UniqueValue) UniqueValue = IV; else if (UniqueValue != IV) HasUniqueIncomingValue = false; @@ -762,7 +466,350 @@ LoopSimplify::InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader) { return BEBlock; } -void LoopSimplify::verifyAnalysis() const { +/// \brief Simplify one loop and queue further loops for simplification. +/// +/// FIXME: Currently this accepts both lots of analyses that it uses and a raw +/// Pass pointer. The Pass pointer is used by numerous utilities to update +/// specific analyses. Rather than a pass it would be much cleaner and more +/// explicit if they accepted the analysis directly and then updated it. +static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist, + AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI, + ScalarEvolution *SE, Pass *PP, + const DataLayout *DL) { + bool Changed = false; +ReprocessLoop: + + // Check to see that no blocks (other than the header) in this loop have + // predecessors that are not in the loop. This is not valid for natural + // loops, but can occur if the blocks are unreachable. Since they are + // unreachable we can just shamelessly delete those CFG edges! + for (Loop::block_iterator BB = L->block_begin(), E = L->block_end(); + BB != E; ++BB) { + if (*BB == L->getHeader()) continue; + + SmallPtrSet<BasicBlock*, 4> BadPreds; + for (pred_iterator PI = pred_begin(*BB), + PE = pred_end(*BB); PI != PE; ++PI) { + BasicBlock *P = *PI; + if (!L->contains(P)) + BadPreds.insert(P); + } + + // Delete each unique out-of-loop (and thus dead) predecessor. + for (SmallPtrSet<BasicBlock*, 4>::iterator I = BadPreds.begin(), + E = BadPreds.end(); I != E; ++I) { + + DEBUG(dbgs() << "LoopSimplify: Deleting edge from dead predecessor " + << (*I)->getName() << "\n"); + + // Inform each successor of each dead pred. + for (succ_iterator SI = succ_begin(*I), SE = succ_end(*I); SI != SE; ++SI) + (*SI)->removePredecessor(*I); + // Zap the dead pred's terminator and replace it with unreachable. 
+ TerminatorInst *TI = (*I)->getTerminator(); + TI->replaceAllUsesWith(UndefValue::get(TI->getType())); + (*I)->getTerminator()->eraseFromParent(); + new UnreachableInst((*I)->getContext(), *I); + Changed = true; + } + } + + // If there are exiting blocks with branches on undef, resolve the undef in + // the direction which will exit the loop. This will help simplify loop + // trip count computations. + SmallVector<BasicBlock*, 8> ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(), + E = ExitingBlocks.end(); I != E; ++I) + if (BranchInst *BI = dyn_cast<BranchInst>((*I)->getTerminator())) + if (BI->isConditional()) { + if (UndefValue *Cond = dyn_cast<UndefValue>(BI->getCondition())) { + + DEBUG(dbgs() << "LoopSimplify: Resolving \"br i1 undef\" to exit in " + << (*I)->getName() << "\n"); + + BI->setCondition(ConstantInt::get(Cond->getType(), + !L->contains(BI->getSuccessor(0)))); + + // This may make the loop analyzable, force SCEV recomputation. + if (SE) + SE->forgetLoop(L); + + Changed = true; + } + } + + // Does the loop already have a preheader? If so, don't insert one. + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) { + Preheader = InsertPreheaderForLoop(L, PP); + if (Preheader) { + ++NumInserted; + Changed = true; + } + } + + // Next, check to make sure that all exit nodes of the loop only have + // predecessors that are inside of the loop. This check guarantees that the + // loop preheader/header will dominate the exit blocks. If the exit block has + // predecessors from outside of the loop, split the edge now. + SmallVector<BasicBlock*, 8> ExitBlocks; + L->getExitBlocks(ExitBlocks); + + SmallSetVector<BasicBlock *, 8> ExitBlockSet(ExitBlocks.begin(), + ExitBlocks.end()); + for (SmallSetVector<BasicBlock *, 8>::iterator I = ExitBlockSet.begin(), + E = ExitBlockSet.end(); I != E; ++I) { + BasicBlock *ExitBlock = *I; + for (pred_iterator PI = pred_begin(ExitBlock), PE = pred_end(ExitBlock); + PI != PE; ++PI) + // Must be exactly this loop: no subloops, parent loops, or non-loop preds + // allowed. + if (!L->contains(*PI)) { + if (rewriteLoopExitBlock(L, ExitBlock, PP)) { + ++NumInserted; + Changed = true; + } + break; + } + } + + // If the header has more than two predecessors at this point (from the + // preheader and from multiple backedges), we must adjust the loop. + BasicBlock *LoopLatch = L->getLoopLatch(); + if (!LoopLatch) { + // If this is really a nested loop, rip it out into a child loop. Don't do + // this for loops with a giant number of backedges, just factor them into a + // common backedge instead. + if (L->getNumBackEdges() < 8) { + if (Loop *OuterL = separateNestedLoop(L, Preheader, AA, DT, LI, SE, PP)) { + ++NumNested; + // Enqueue the outer loop as it should be processed next in our + // depth-first nest walk. + Worklist.push_back(OuterL); + + // This is a big restructuring change, reprocess the whole loop. + Changed = true; + // GCC doesn't tail recursion eliminate this. + // FIXME: It isn't clear we can't rely on LLVM to TRE this. + goto ReprocessLoop; + } + } + + // If we either couldn't, or didn't want to, identify nesting of the loops, + // insert a new block that all backedges target, then make it jump to the + // loop header. + LoopLatch = insertUniqueBackedgeBlock(L, Preheader, AA, DT, LI); + if (LoopLatch) { + ++NumInserted; + Changed = true; + } + } + + // Scan over the PHI nodes in the loop header. 
Since they now have only two + // incoming values (the loop is canonicalized), we may have simplified the PHI + // down to 'X = phi [X, Y]', which should be replaced with 'Y'. + PHINode *PN; + for (BasicBlock::iterator I = L->getHeader()->begin(); + (PN = dyn_cast<PHINode>(I++)); ) + if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, DT)) { + if (AA) AA->deleteValue(PN); + if (SE) SE->forgetValue(PN); + PN->replaceAllUsesWith(V); + PN->eraseFromParent(); + } + + // If this loop has multiple exits and the exits all go to the same + // block, attempt to merge the exits. This helps several passes, such + // as LoopRotation, which do not support loops with multiple exits. + // SimplifyCFG also does this (and this code uses the same utility + // function), however this code is loop-aware, where SimplifyCFG is + // not. That gives it the advantage of being able to hoist + // loop-invariant instructions out of the way to open up more + // opportunities, and the disadvantage of having the responsibility + // to preserve dominator information. + bool UniqueExit = true; + if (!ExitBlocks.empty()) + for (unsigned i = 1, e = ExitBlocks.size(); i != e; ++i) + if (ExitBlocks[i] != ExitBlocks[0]) { + UniqueExit = false; + break; + } + if (UniqueExit) { + for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) { + BasicBlock *ExitingBlock = ExitingBlocks[i]; + if (!ExitingBlock->getSinglePredecessor()) continue; + BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator()); + if (!BI || !BI->isConditional()) continue; + CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition()); + if (!CI || CI->getParent() != ExitingBlock) continue; + + // Attempt to hoist out all instructions except for the + // comparison and the branch. + bool AllInvariant = true; + bool AnyInvariant = false; + for (BasicBlock::iterator I = ExitingBlock->begin(); &*I != BI; ) { + Instruction *Inst = I++; + // Skip debug info intrinsics. + if (isa<DbgInfoIntrinsic>(Inst)) + continue; + if (Inst == CI) + continue; + if (!L->makeLoopInvariant(Inst, AnyInvariant, + Preheader ? Preheader->getTerminator() + : nullptr)) { + AllInvariant = false; + break; + } + } + if (AnyInvariant) { + Changed = true; + // The loop disposition of all SCEV expressions that depend on any + // hoisted values have also changed. + if (SE) + SE->forgetLoopDispositions(L); + } + if (!AllInvariant) continue; + + // The block has now been cleared of all instructions except for + // a comparison and a conditional branch. SimplifyCFG may be able + // to fold it now. + if (!FoldBranchToCommonDest(BI, DL)) continue; + + // Success. The block is now dead, so remove it from the loop, + // update the dominator tree and delete it. + DEBUG(dbgs() << "LoopSimplify: Eliminating exiting block " + << ExitingBlock->getName() << "\n"); + + // Notify ScalarEvolution before deleting this block. Currently assume the + // parent loop doesn't change (spliting edges doesn't count). If blocks, + // CFG edges, or other values in the parent loop change, then we need call + // to forgetLoop() for the parent instead. 
+ if (SE) + SE->forgetLoop(L); + + assert(pred_begin(ExitingBlock) == pred_end(ExitingBlock)); + Changed = true; + LI->removeBlock(ExitingBlock); + + DomTreeNode *Node = DT->getNode(ExitingBlock); + const std::vector<DomTreeNodeBase<BasicBlock> *> &Children = + Node->getChildren(); + while (!Children.empty()) { + DomTreeNode *Child = Children.front(); + DT->changeImmediateDominator(Child, Node->getIDom()); + } + DT->eraseNode(ExitingBlock); + + BI->getSuccessor(0)->removePredecessor(ExitingBlock); + BI->getSuccessor(1)->removePredecessor(ExitingBlock); + ExitingBlock->eraseFromParent(); + } + } + + return Changed; +} + +bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP, + AliasAnalysis *AA, ScalarEvolution *SE, + const DataLayout *DL) { + bool Changed = false; + + // Worklist maintains our depth-first queue of loops in this nest to process. + SmallVector<Loop *, 4> Worklist; + Worklist.push_back(L); + + // Walk the worklist from front to back, pushing newly found sub loops onto + // the back. This will let us process loops from back to front in depth-first + // order. We can use this simple process because loops form a tree. + for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) { + Loop *L2 = Worklist[Idx]; + for (Loop::iterator I = L2->begin(), E = L2->end(); I != E; ++I) + Worklist.push_back(*I); + } + + while (!Worklist.empty()) + Changed |= simplifyOneLoop(Worklist.pop_back_val(), Worklist, AA, DT, LI, + SE, PP, DL); + + return Changed; +} + +namespace { + struct LoopSimplify : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + LoopSimplify() : FunctionPass(ID) { + initializeLoopSimplifyPass(*PassRegistry::getPassRegistry()); + } + + // AA - If we have an alias analysis object to update, this is it, otherwise + // this is null. + AliasAnalysis *AA; + DominatorTree *DT; + LoopInfo *LI; + ScalarEvolution *SE; + const DataLayout *DL; + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + // We need loop information to identify the loops... + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + + AU.addRequired<LoopInfo>(); + AU.addPreserved<LoopInfo>(); + + AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<DependenceAnalysis>(); + AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added. + } + + /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees. + void verifyAnalysis() const override; + }; +} + +char LoopSimplify::ID = 0; +INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify", + "Canonicalize natural loops", true, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", + "Canonicalize natural loops", true, false) + +// Publicly exposed interface to pass... +char &llvm::LoopSimplifyID = LoopSimplify::ID; +Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); } + +/// runOnFunction - Run down all loops in the CFG (recursively, but we could do +/// it in any convenient order) inserting preheaders... +/// +bool LoopSimplify::runOnFunction(Function &F) { + bool Changed = false; + AA = getAnalysisIfAvailable<AliasAnalysis>(); + LI = &getAnalysis<LoopInfo>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + SE = getAnalysisIfAvailable<ScalarEvolution>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? 
&DLP->getDataLayout() : nullptr; + + // Simplify each loop nest in the function. + for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) + Changed |= simplifyLoop(*I, DT, LI, this, AA, SE, DL); + + return Changed; +} + +// FIXME: Restore this code when we re-enable verification in verifyAnalysis +// below. +#if 0 +static void verifyLoop(Loop *L) { + // Verify subloops. + for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) + verifyLoop(*I); + // It used to be possible to just assert L->isLoopSimplifyForm(), however // with the introduction of indirectbr, there are now cases where it's // not possible to transform a loop as necessary. We can at least check @@ -799,3 +846,15 @@ void LoopSimplify::verifyAnalysis() const { (void)HasIndBrExiting; } } +#endif + +void LoopSimplify::verifyAnalysis() const { + // FIXME: This routine is being called mid-way through the loop pass manager + // as loop passes destroy this analysis. That's actually fine, but we have no + // way of expressing that here. Once all of the passes that destroy this are + // hoisted out of the loop pass manager we can add back verification here. +#if 0 + for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) + verifyLoop(*I); +#endif +} diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 162807d..ab1c25a 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -16,22 +16,29 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-unroll" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" using namespace llvm; +#define DEBUG_TYPE "loop-unroll" + // TODO: Should these be here or in LoopUnroll? STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled"); STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)"); @@ -58,18 +65,23 @@ static inline void RemapInstruction(Instruction *I, /// FoldBlockIntoPredecessor - Folds a basic block into its predecessor if it /// only has one predecessor, and that predecessor only has one successor. -/// The LoopInfo Analysis that is passed will be kept consistent. -/// Returns the new combined block. -static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, - LPPassManager *LPM) { +/// The LoopInfo Analysis that is passed will be kept consistent. If folding is +/// successful references to the containing loop must be removed from +/// ScalarEvolution by calling ScalarEvolution::forgetLoop because SE may have +/// references to the eliminated BB. The argument ForgottenLoops contains a set +/// of loops that have already been forgotten to prevent redundant, expensive +/// calls to ScalarEvolution::forgetLoop. Returns the new combined block. 
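Stepping back to the simplifyLoop driver above: it lays the loop tree out in a vector front to back, so every subloop lands after its parent, then pops from the back, so every subloop is processed before its parent. A toy model of that ordering, with a hand-rolled loop type in place of llvm::Loop:

    #include <cstddef>
    #include <vector>

    struct ToyLoop {
      std::vector<ToyLoop *> SubLoops;
      bool Simplified = false;
    };

    // Expansion phase: indexing (not iterators) is deliberate, because
    // push_back may reallocate while the scan is still in progress.
    static void simplifyLoopNest(ToyLoop *Root) {
      std::vector<ToyLoop *> Worklist{Root};
      for (std::size_t Idx = 0; Idx != Worklist.size(); ++Idx)
        for (ToyLoop *Sub : Worklist[Idx]->SubLoops)
          Worklist.push_back(Sub);
      // Processing phase: back to front, children before parents.
      while (!Worklist.empty()) {
        ToyLoop *L = Worklist.back();
        Worklist.pop_back();
        L->Simplified = true; // stands in for simplifyOneLoop(L, Worklist, ...)
      }
    }

The FoldBlockIntoPredecessor helper whose updated documentation appears just above continues below.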
+static BasicBlock * +FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, LPPassManager *LPM, + SmallPtrSetImpl<Loop *> &ForgottenLoops) { // Merge basic blocks into their predecessor if there is only one distinct // pred, and if there is only one distinct successor of the predecessor, and // if there are no PHI nodes. BasicBlock *OnlyPred = BB->getSinglePredecessor(); - if (!OnlyPred) return 0; + if (!OnlyPred) return nullptr; if (OnlyPred->getTerminator()->getNumSuccessors() != 1) - return 0; + return nullptr; DEBUG(dbgs() << "Merging: " << *BB << "into: " << *OnlyPred); @@ -98,8 +110,10 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, // ScalarEvolution holds references to loop exit blocks. if (LPM) { if (ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>()) { - if (Loop *L = LI->getLoopFor(BB)) - SE->forgetLoop(L); + if (Loop *L = LI->getLoopFor(BB)) { + if (ForgottenLoops.insert(L)) + SE->forgetLoop(L); + } } } LI->removeBlock(BB); @@ -137,10 +151,10 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, /// removed from the LoopPassManager as well. LPM can also be NULL. /// /// This utility preserves LoopInfo. If DominatorTree or ScalarEvolution are -/// available it must also preserve those analyses. +/// available from the Pass it must also preserve those analyses. bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool AllowRuntime, unsigned TripMultiple, - LoopInfo *LI, LPPassManager *LPM) { + LoopInfo *LI, Pass *PP, LPPassManager *LPM) { BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) { DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n"); @@ -208,8 +222,8 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, // Notify ScalarEvolution that the loop will be substantially changed, // if not outright eliminated. - if (LPM) { - ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); + if (PP) { + ScalarEvolution *SE = PP->getAnalysisIfAvailable<ScalarEvolution>(); if (SE) SE->forgetLoop(L); } @@ -225,18 +239,35 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, (unsigned)GreatestCommonDivisor64(Count, TripMultiple); } + // Report the unrolling decision. + DebugLoc LoopLoc = L->getStartLoc(); + Function *F = Header->getParent(); + LLVMContext &Ctx = F->getContext(); + if (CompletelyUnroll) { DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName() << " with trip count " << TripCount << "!\n"); + emitOptimizationRemark(Ctx, DEBUG_TYPE, *F, LoopLoc, + Twine("completely unrolled loop with ") + + Twine(TripCount) + " iterations"); } else { + auto EmitDiag = [&](const Twine &T) { + emitOptimizationRemark(Ctx, DEBUG_TYPE, *F, LoopLoc, + "unrolled loop by a factor of " + Twine(Count) + + T); + }; + DEBUG(dbgs() << "UNROLLING loop %" << Header->getName() << " by " << Count); if (TripMultiple == 0 || BreakoutTrip != TripMultiple) { DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip); + EmitDiag(" with a breakout at trip " + Twine(BreakoutTrip)); } else if (TripMultiple != 1) { DEBUG(dbgs() << " with " << TripMultiple << " trips per branch"); + EmitDiag(" with " + Twine(TripMultiple) + " trips per branch"); } else if (RuntimeTripCount) { DEBUG(dbgs() << " with run-time trip count"); + EmitDiag(" with run-time trip count"); } DEBUG(dbgs() << "!\n"); } @@ -400,23 +431,29 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, } // Merge adjacent basic blocks, if possible. 
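The merging loop that follows threads a ForgottenLoops set through FoldBlockIntoPredecessor so that ScalarEvolution::forgetLoop, an expensive invalidation, runs at most once per loop no matter how many latch blocks get folded. The memoization in miniature, with std::set standing in for SmallPtrSet and a stub SE type (both hypothetical):

    #include <set>
    #include <string>

    struct ToySE {
      void forgetLoop(const std::string &L) { (void)L; /* expensive */ }
    };

    // insert().second is true only the first time L is seen, matching the
    // 'if (ForgottenLoops.insert(L)) SE->forgetLoop(L);' in the hunk below.
    static void forgetLoopOnce(ToySE &SE, std::set<std::string> &Forgotten,
                               const std::string &L) {
      if (Forgotten.insert(L).second)
        SE.forgetLoop(L);
    }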
+ SmallPtrSet<Loop *, 4> ForgottenLoops; for (unsigned i = 0, e = Latches.size(); i != e; ++i) { BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator()); if (Term->isUnconditional()) { BasicBlock *Dest = Term->getSuccessor(0); - if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI, LPM)) + if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI, LPM, + ForgottenLoops)) std::replace(Latches.begin(), Latches.end(), Dest, Fold); } } - if (LPM) { + DominatorTree *DT = nullptr; + if (PP) { // FIXME: Reconstruct dom info, because it is not preserved properly. // Incrementally updating domtree after loop unrolling would be easy. - if (DominatorTree *DT = LPM->getAnalysisIfAvailable<DominatorTree>()) - DT->runOnFunction(*L->getHeader()->getParent()); + if (DominatorTreeWrapperPass *DTWP = + PP->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { + DT = &DTWP->getDomTree(); + DT->recalculate(*L->getHeader()->getParent()); + } // Simplify any new induction variables in the partially unrolled loop. - ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); + ScalarEvolution *SE = PP->getAnalysisIfAvailable<ScalarEvolution>(); if (SE && !CompletelyUnroll) { SmallVector<WeakVH, 16> DeadInsts; simplifyLoopIVs(L, SE, LPM, DeadInsts); @@ -449,9 +486,36 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, NumCompletelyUnrolled += CompletelyUnroll; ++NumUnrolled; + + Loop *OuterL = L->getParentLoop(); // Remove the loop from the LoopPassManager if it's completely removed. - if (CompletelyUnroll && LPM != NULL) + if (CompletelyUnroll && LPM != nullptr) LPM->deleteLoopFromQueue(L); + // If we have a pass and a DominatorTree we should re-simplify impacted loops + // to ensure subsequent analyses can rely on this form. We want to simplify + // at least one layer outside of the loop that was unrolled so that any + // changes to the parent loop exposed by the unrolling are considered. + if (PP && DT) { + if (!OuterL && !CompletelyUnroll) + OuterL = L; + if (OuterL) { + DataLayoutPass *DLP = PP->getAnalysisIfAvailable<DataLayoutPass>(); + const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; + ScalarEvolution *SE = PP->getAnalysisIfAvailable<ScalarEvolution>(); + simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE, DL); + + // LCSSA must be performed on the outermost affected loop. The unrolled + // loop's last loop latch is guaranteed to be in the outermost loop after + // deleteLoopFromQueue updates LoopInfo. 
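The nest walk that follows hoists OuterL upward until it contains the loop of the last latch, so formLCSSARecursively runs on the outermost loop the unroll actually disturbed. A sketch of that containment walk with hand-rolled parent links instead of llvm::Loop (a null LatchLoop, a latch in no loop at all, climbs to the top of the nest, which appears to match the real code's behavior):

    struct NestLoop {
      NestLoop *Parent = nullptr;
      // True when Other is this loop or is nested anywhere inside it.
      bool contains(const NestLoop *Other) const {
        for (const NestLoop *L = Other; L; L = L->Parent)
          if (L == this)
            return true;
        return false;
      }
    };

    // Mirrors the walk continued below: stop once Parent is the loop that
    // holds the latch, leaving OuterL as the outermost affected loop.
    static NestLoop *outermostAffected(NestLoop *OuterL, NestLoop *LatchLoop) {
      if (!OuterL->contains(LatchLoop))
        while (OuterL->Parent != LatchLoop)
          OuterL = OuterL->Parent;
      return OuterL;
    }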
+ Loop *LatchLoop = LI->getLoopFor(Latches.back()); + if (!OuterL->contains(LatchLoop)) + while (OuterL->getParentLoop() != LatchLoop) + OuterL = OuterL->getParentLoop(); + + formLCSSARecursively(*OuterL, *DT, SE); + } + } + return true; } diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index d801d5f..a96c46a 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -21,7 +21,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-unroll" #include "llvm/Transforms/Utils/UnrollLoop.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LoopIterator.h" @@ -37,6 +36,8 @@ using namespace llvm; +#define DEBUG_TYPE "loop-unroll" + STATISTIC(NumRuntimeUnrolled, "Number of loops unrolled with run-time trip counts"); @@ -58,7 +59,7 @@ static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count, BasicBlock *OrigPH, BasicBlock *NewPH, ValueToValueMapTy &LVMap, Pass *P) { BasicBlock *Latch = L->getLoopLatch(); - assert(Latch != 0 && "Loop must have a latch"); + assert(Latch && "Loop must have a latch"); // Create a PHI node for each outgoing value from the original loop // (which means it is an outgoing value from the prolog code too). @@ -110,7 +111,7 @@ static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count, new ICmpInst(InsertPt, ICmpInst::ICMP_ULT, TripCount, ConstantInt::get(TripCount->getType(), Count)); BasicBlock *Exit = L->getUniqueExitBlock(); - assert(Exit != 0 && "Loop must have a single exit block only"); + assert(Exit && "Loop must have a single exit block only"); // Split the exit to maintain loop canonicalization guarantees SmallVector<BasicBlock*, 4> Preds(pred_begin(Exit), pred_end(Exit)); if (!Exit->isLandingPad()) { @@ -232,7 +233,7 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, // Make sure the loop is in canonical form, and there is a single // exit block only. - if (!L->isLoopSimplifyForm() || L->getUniqueExitBlock() == 0) + if (!L->isLoopSimplifyForm() || !L->getUniqueExitBlock()) return false; // Use Scalar Evolution to compute the trip count. This allows more @@ -240,7 +241,7 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, if (!LPM) return false; ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); - if (SE == 0) + if (!SE) return false; // Only unroll loops with a computable trip count and the trip count needs @@ -279,17 +280,17 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, SCEVExpander Expander(*SE, "loop-unroll"); Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(), PreHeaderBR); - Type *CountTy = TripCount->getType(); - BinaryOperator *ModVal = - BinaryOperator::CreateURem(TripCount, - ConstantInt::get(CountTy, Count), - "xtraiter"); - ModVal->insertBefore(PreHeaderBR); - - // Check if for no extra iterations, then jump to unrolled loop - Value *BranchVal = new ICmpInst(PreHeaderBR, - ICmpInst::ICMP_NE, ModVal, - ConstantInt::get(CountTy, 0), "lcmp"); + + IRBuilder<> B(PreHeaderBR); + Value *ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter"); + + // Check if for no extra iterations, then jump to unrolled loop. We have to + // check that the trip count computation didn't overflow when adding one to + // the backedge taken count. 
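The IRBuilder calls that follow compute the leftover ("xtraiter") count. Two things changed versus the deleted code: the urem became an AND against Count - 1, which is only equivalent when the unroll count is a power of two (presumably enforced elsewhere in the unroller; treat that as an assumption of this sketch), and a zero test on TripCount was added because TripCount is the backedge-taken count plus one and that addition can wrap to zero. The same arithmetic as a standalone, checkable function with hypothetical names:

    #include <cassert>
    #include <cstdint>

    // Returns true when control should enter the prolog loop.
    static bool branchToProlog(uint64_t TripCount, uint64_t Count) {
      assert(Count != 0 && (Count & (Count - 1)) == 0 &&
             "sketch assumes Count is a power of two");
      uint64_t ModVal = TripCount & (Count - 1); // xtraiter = TripCount % Count
      bool LCmp = ModVal != 0;                   // lcmp.mod
      bool Overflow = TripCount == 0;            // lcmp.overflow (BTC+1 wrapped)
      return Overflow || LCmp;                   // lcmp.or
    }

For example, with Count = 4: TripCount = 7 gives ModVal = 3 and takes the prolog, TripCount = 8 skips it, and TripCount = 0 (the overflow case) conservatively takes it as well. The IR that emits exactly these values continues just below.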
+ Value *LCmp = B.CreateIsNotNull(ModVal, "lcmp.mod"); + Value *OverflowCheck = B.CreateIsNull(TripCount, "lcmp.overflow"); + Value *BranchVal = B.CreateOr(OverflowCheck, LCmp, "lcmp.or"); + // Branch to either the extra iterations or the unrolled loop // We will fix up the true branch label when adding loop body copies BranchInst::Create(PEnd, PEnd, BranchVal, PreHeaderBR); @@ -301,7 +302,7 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, ValueToValueMapTy LVMap; Function *F = Header->getParent(); // These variables are used to update the CFG links in each iteration - BasicBlock *CompareBB = 0; + BasicBlock *CompareBB = nullptr; BasicBlock *LastLoopBB = PH; // Get an ordered list of blocks in the loop to help with the ordering of the // cloned blocks in the prolog code @@ -343,6 +344,7 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, } // The comparison w/ the extra iteration value and branch + Type *CountTy = TripCount->getType(); Value *BranchVal = new ICmpInst(*NewBB, ICmpInst::ICMP_EQ, ModVal, ConstantInt::get(CountTy, leftOverIters), "un.tmp"); diff --git a/contrib/llvm/lib/Transforms/Utils/LowerExpectIntrinsic.cpp b/contrib/llvm/lib/Transforms/Utils/LowerExpectIntrinsic.cpp index e017f50..ff89e74 100644 --- a/contrib/llvm/lib/Transforms/Utils/LowerExpectIntrinsic.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LowerExpectIntrinsic.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "lower-expect-intrinsic" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/BasicBlock.h" @@ -29,6 +28,8 @@ using namespace llvm; +#define DEBUG_TYPE "lower-expect-intrinsic" + STATISTIC(IfHandled, "Number of 'expect' intrinsic instructions handled"); static cl::opt<uint32_t> @@ -52,7 +53,7 @@ namespace { initializeLowerExpectIntrinsicPass(*PassRegistry::getPassRegistry()); } - bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; }; } @@ -94,15 +95,25 @@ bool LowerExpectIntrinsic::HandleIfExpect(BranchInst *BI) { return false; // Handle non-optimized IR code like: - // %expval = call i64 @llvm.expect.i64.i64(i64 %conv1, i64 1) + // %expval = call i64 @llvm.expect.i64(i64 %conv1, i64 1) // %tobool = icmp ne i64 %expval, 0 // br i1 %tobool, label %if.then, label %if.end + // + // Or the following simpler case: + // %expval = call i1 @llvm.expect.i1(i1 %cmp, i1 1) + // br i1 %expval, label %if.then, label %if.end + + CallInst *CI; ICmpInst *CmpI = dyn_cast<ICmpInst>(BI->getCondition()); - if (!CmpI || CmpI->getPredicate() != CmpInst::ICMP_NE) - return false; + if (!CmpI) { + CI = dyn_cast<CallInst>(BI->getCondition()); + } else { + if (CmpI->getPredicate() != CmpInst::ICMP_NE) + return false; + CI = dyn_cast<CallInst>(CmpI->getOperand(0)); + } - CallInst *CI = dyn_cast<CallInst>(CmpI->getOperand(0)); if (!CI) return false; @@ -127,7 +138,10 @@ bool LowerExpectIntrinsic::HandleIfExpect(BranchInst *BI) { BI->setMetadata(LLVMContext::MD_prof, Node); - CmpI->setOperand(0, ArgValue); + if (CmpI) + CmpI->setOperand(0, ArgValue); + else + BI->setCondition(ArgValue); return true; } diff --git a/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp b/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp index 9799a30..66d57b0 100644 --- a/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp @@ -1,4 +1,4 @@ -//===- LowerInvoke.cpp - Eliminate Invoke & Unwind instructions -----------===// 
+//===- LowerInvoke.cpp - Eliminate Invoke instructions --------------------===// // // The LLVM Compiler Infrastructure // @@ -8,94 +8,34 @@ //===----------------------------------------------------------------------===// // // This transformation is designed for use by code generators which do not yet -// support stack unwinding. This pass supports two models of exception handling -// lowering, the 'cheap' support and the 'expensive' support. -// -// 'Cheap' exception handling support gives the program the ability to execute -// any program which does not "throw an exception", by turning 'invoke' -// instructions into calls and by turning 'unwind' instructions into calls to -// abort(). If the program does dynamically use the unwind instruction, the -// program will print a message then abort. -// -// 'Expensive' exception handling support gives the full exception handling -// support to the program at the cost of making the 'invoke' instruction -// really expensive. It basically inserts setjmp/longjmp calls to emulate the -// exception handling as necessary. -// -// Because the 'expensive' support slows down programs a lot, and EH is only -// used for a subset of the programs, it must be specifically enabled by an -// option. -// -// Note that after this pass runs the CFG is not entirely accurate (exceptional -// control flow edges are not correct anymore) so only very simple things should -// be done after the lowerinvoke pass has run (like generation of native code). -// This should not be used as a general purpose "my LLVM-to-LLVM pass doesn't -// support the invoke instruction yet" lowering pass. +// support stack unwinding. This pass converts 'invoke' instructions to 'call' +// instructions, so that any exception-handling 'landingpad' blocks become dead +// code (which can be removed by running the '-simplifycfg' pass afterwards). // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "lowerinvoke" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" -#include <csetjmp> -#include <set> using namespace llvm; -STATISTIC(NumInvokes, "Number of invokes replaced"); -STATISTIC(NumSpilled, "Number of registers live across unwind edges"); +#define DEBUG_TYPE "lowerinvoke" -static cl::opt<bool> ExpensiveEHSupport("enable-correct-eh-support", - cl::desc("Make the -lowerinvoke pass insert expensive, but correct, EH code")); +STATISTIC(NumInvokes, "Number of invokes replaced"); namespace { class LowerInvoke : public FunctionPass { - const TargetMachine *TM; - - // Used for both models. - Constant *AbortFn; - - // Used for expensive EH support. 
- StructType *JBLinkTy; - GlobalVariable *JBListHead; - Constant *SetJmpFn, *LongJmpFn, *StackSaveFn, *StackRestoreFn; - bool useExpensiveEHSupport; - public: static char ID; // Pass identification, replacement for typeid - explicit LowerInvoke(const TargetMachine *TM = 0, - bool useExpensiveEHSupport = ExpensiveEHSupport) - : FunctionPass(ID), TM(TM), - useExpensiveEHSupport(useExpensiveEHSupport) { + explicit LowerInvoke() : FunctionPass(ID) { initializeLowerInvokePass(*PassRegistry::getPassRegistry()); } - bool doInitialization(Module &M); - bool runOnFunction(Function &F); - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - // This is a cluster of orthogonal Transforms - AU.addPreserved("mem2reg"); - AU.addPreservedID(LowerSwitchID); - } - - private: - bool insertCheapEHSupport(Function &F); - void splitLiveRangesLiveAcrossInvokes(SmallVectorImpl<InvokeInst*>&Invokes); - void rewriteExpensiveInvoke(InvokeInst *II, unsigned InvokeNo, - AllocaInst *InvokeNum, AllocaInst *StackPtr, - SwitchInst *CatchSwitch); - bool insertExpensiveEHSupport(Function &F); + bool runOnFunction(Function &F) override; }; } @@ -107,65 +47,11 @@ INITIALIZE_PASS(LowerInvoke, "lowerinvoke", char &llvm::LowerInvokePassID = LowerInvoke::ID; // Public Interface To the LowerInvoke pass. -FunctionPass *llvm::createLowerInvokePass(const TargetMachine *TM, - bool useExpensiveEHSupport) { - return new LowerInvoke(TM, useExpensiveEHSupport || ExpensiveEHSupport); +FunctionPass *llvm::createLowerInvokePass() { + return new LowerInvoke(); } -// doInitialization - Make sure that there is a prototype for abort in the -// current module. -bool LowerInvoke::doInitialization(Module &M) { - Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext()); - if (useExpensiveEHSupport) { - // Insert a type for the linked list of jump buffers. - const TargetLowering *TLI = TM ? TM->getTargetLowering() : 0; - unsigned JBSize = TLI ? TLI->getJumpBufSize() : 0; - JBSize = JBSize ? JBSize : 200; - Type *JmpBufTy = ArrayType::get(VoidPtrTy, JBSize); - - JBLinkTy = StructType::create(M.getContext(), "llvm.sjljeh.jmpbufty"); - Type *Elts[] = { JmpBufTy, PointerType::getUnqual(JBLinkTy) }; - JBLinkTy->setBody(Elts); - - Type *PtrJBList = PointerType::getUnqual(JBLinkTy); - - // Now that we've done that, insert the jmpbuf list head global, unless it - // already exists. - if (!(JBListHead = M.getGlobalVariable("llvm.sjljeh.jblist", PtrJBList))) { - JBListHead = new GlobalVariable(M, PtrJBList, false, - GlobalValue::LinkOnceAnyLinkage, - Constant::getNullValue(PtrJBList), - "llvm.sjljeh.jblist"); - } - -// VisualStudio defines setjmp as _setjmp -#if defined(_MSC_VER) && defined(setjmp) && \ - !defined(setjmp_undefined_for_msvc) -# pragma push_macro("setjmp") -# undef setjmp -# define setjmp_undefined_for_msvc -#endif - - SetJmpFn = Intrinsic::getDeclaration(&M, Intrinsic::setjmp); - -#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc) - // let's return it to _setjmp state -# pragma pop_macro("setjmp") -# undef setjmp_undefined_for_msvc -#endif - - LongJmpFn = Intrinsic::getDeclaration(&M, Intrinsic::longjmp); - StackSaveFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave); - StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore); - } - - // We need the 'write' and 'abort' functions for both models. 
- AbortFn = M.getOrInsertFunction("abort", Type::getVoidTy(M.getContext()), - (Type *)0); - return true; -} - -bool LowerInvoke::insertCheapEHSupport(Function &F) { +bool LowerInvoke::runOnFunction(Function &F) { bool Changed = false; for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) { @@ -192,388 +78,3 @@ bool LowerInvoke::insertCheapEHSupport(Function &F) { } return Changed; } - -/// rewriteExpensiveInvoke - Insert code and hack the function to replace the -/// specified invoke instruction with a call. -void LowerInvoke::rewriteExpensiveInvoke(InvokeInst *II, unsigned InvokeNo, - AllocaInst *InvokeNum, - AllocaInst *StackPtr, - SwitchInst *CatchSwitch) { - ConstantInt *InvokeNoC = ConstantInt::get(Type::getInt32Ty(II->getContext()), - InvokeNo); - - // If the unwind edge has phi nodes, split the edge. - if (isa<PHINode>(II->getUnwindDest()->begin())) { - SplitCriticalEdge(II, 1, this); - - // If there are any phi nodes left, they must have a single predecessor. - while (PHINode *PN = dyn_cast<PHINode>(II->getUnwindDest()->begin())) { - PN->replaceAllUsesWith(PN->getIncomingValue(0)); - PN->eraseFromParent(); - } - } - - // Insert a store of the invoke num before the invoke and store zero into the - // location afterward. - new StoreInst(InvokeNoC, InvokeNum, true, II); // volatile - - // Insert a store of the stack ptr before the invoke, so we can restore it - // later in the exception case. - CallInst* StackSaveRet = CallInst::Create(StackSaveFn, "ssret", II); - new StoreInst(StackSaveRet, StackPtr, true, II); // volatile - - BasicBlock::iterator NI = II->getNormalDest()->getFirstInsertionPt(); - // nonvolatile. - new StoreInst(Constant::getNullValue(Type::getInt32Ty(II->getContext())), - InvokeNum, false, NI); - - Instruction* StackPtrLoad = - new LoadInst(StackPtr, "stackptr.restore", true, - II->getUnwindDest()->getFirstInsertionPt()); - CallInst::Create(StackRestoreFn, StackPtrLoad, "")->insertAfter(StackPtrLoad); - - // Add a switch case to our unwind block. - CatchSwitch->addCase(InvokeNoC, II->getUnwindDest()); - - // Insert a normal call instruction. - SmallVector<Value*,16> CallArgs(II->op_begin(), II->op_end() - 3); - CallInst *NewCall = CallInst::Create(II->getCalledValue(), - CallArgs, "", II); - NewCall->takeName(II); - NewCall->setCallingConv(II->getCallingConv()); - NewCall->setAttributes(II->getAttributes()); - NewCall->setDebugLoc(II->getDebugLoc()); - II->replaceAllUsesWith(NewCall); - - // Replace the invoke with an uncond branch. - BranchInst::Create(II->getNormalDest(), NewCall->getParent()); - II->eraseFromParent(); -} - -/// MarkBlocksLiveIn - Insert BB and all of its predescessors into LiveBBs until -/// we reach blocks we've already seen. -static void MarkBlocksLiveIn(BasicBlock *BB, std::set<BasicBlock*> &LiveBBs) { - if (!LiveBBs.insert(BB).second) return; // already been here. - - for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) - MarkBlocksLiveIn(*PI, LiveBBs); -} - -// First thing we need to do is scan the whole function for values that are -// live across unwind edges. Each value that is live across an unwind edge -// we spill into a stack location, guaranteeing that there is nothing live -// across the unwind edge. This process also splits all critical edges -// coming out of invoke's. -void LowerInvoke:: -splitLiveRangesLiveAcrossInvokes(SmallVectorImpl<InvokeInst*> &Invokes) { - // First step, split all critical edges from invoke instructions. 
- for (unsigned i = 0, e = Invokes.size(); i != e; ++i) { - InvokeInst *II = Invokes[i]; - SplitCriticalEdge(II, 0, this); - SplitCriticalEdge(II, 1, this); - assert(!isa<PHINode>(II->getNormalDest()) && - !isa<PHINode>(II->getUnwindDest()) && - "critical edge splitting left single entry phi nodes?"); - } - - Function *F = Invokes.back()->getParent()->getParent(); - - // To avoid having to handle incoming arguments specially, we lower each arg - // to a copy instruction in the entry block. This ensures that the argument - // value itself cannot be live across the entry block. - BasicBlock::iterator AfterAllocaInsertPt = F->begin()->begin(); - while (isa<AllocaInst>(AfterAllocaInsertPt) && - isa<ConstantInt>(cast<AllocaInst>(AfterAllocaInsertPt)->getArraySize())) - ++AfterAllocaInsertPt; - for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); - AI != E; ++AI) { - Type *Ty = AI->getType(); - // Aggregate types can't be cast, but are legal argument types, so we have - // to handle them differently. We use an extract/insert pair as a - // lightweight method to achieve the same goal. - if (isa<StructType>(Ty) || isa<ArrayType>(Ty) || isa<VectorType>(Ty)) { - Instruction *EI = ExtractValueInst::Create(AI, 0, "",AfterAllocaInsertPt); - Instruction *NI = InsertValueInst::Create(AI, EI, 0); - NI->insertAfter(EI); - AI->replaceAllUsesWith(NI); - // Set the operand of the instructions back to the AllocaInst. - EI->setOperand(0, AI); - NI->setOperand(0, AI); - } else { - // This is always a no-op cast because we're casting AI to AI->getType() - // so src and destination types are identical. BitCast is the only - // possibility. - CastInst *NC = new BitCastInst( - AI, AI->getType(), AI->getName()+".tmp", AfterAllocaInsertPt); - AI->replaceAllUsesWith(NC); - // Set the operand of the cast instruction back to the AllocaInst. - // Normally it's forbidden to replace a CastInst's operand because it - // could cause the opcode to reflect an illegal conversion. However, - // we're replacing it here with the same value it was constructed with. - // We do this because the above replaceAllUsesWith() clobbered the - // operand, but we want this one to remain. - NC->setOperand(0, AI); - } - } - - // Finally, scan the code looking for instructions with bad live ranges. - for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) - for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ++II) { - // Ignore obvious cases we don't have to handle. In particular, most - // instructions either have no uses or only have a single use inside the - // current block. Ignore them quickly. - Instruction *Inst = II; - if (Inst->use_empty()) continue; - if (Inst->hasOneUse() && - cast<Instruction>(Inst->use_back())->getParent() == BB && - !isa<PHINode>(Inst->use_back())) continue; - - // If this is an alloca in the entry block, it's not a real register - // value. - if (AllocaInst *AI = dyn_cast<AllocaInst>(Inst)) - if (isa<ConstantInt>(AI->getArraySize()) && BB == F->begin()) - continue; - - // Avoid iterator invalidation by copying users to a temporary vector. - SmallVector<Instruction*,16> Users; - for (Value::use_iterator UI = Inst->use_begin(), E = Inst->use_end(); - UI != E; ++UI) { - Instruction *User = cast<Instruction>(*UI); - if (User->getParent() != BB || isa<PHINode>(User)) - Users.push_back(User); - } - - // Scan all of the uses and see if the live range is live across an unwind - // edge. If we find a use live across an invoke edge, create an alloca - // and spill the value. 
- - // Find all of the blocks that this value is live in. - std::set<BasicBlock*> LiveBBs; - LiveBBs.insert(Inst->getParent()); - while (!Users.empty()) { - Instruction *U = Users.back(); - Users.pop_back(); - - if (!isa<PHINode>(U)) { - MarkBlocksLiveIn(U->getParent(), LiveBBs); - } else { - // Uses for a PHI node occur in their predecessor block. - PHINode *PN = cast<PHINode>(U); - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - if (PN->getIncomingValue(i) == Inst) - MarkBlocksLiveIn(PN->getIncomingBlock(i), LiveBBs); - } - } - - // Now that we know all of the blocks that this thing is live in, see if - // it includes any of the unwind locations. - bool NeedsSpill = false; - for (unsigned i = 0, e = Invokes.size(); i != e; ++i) { - BasicBlock *UnwindBlock = Invokes[i]->getUnwindDest(); - if (UnwindBlock != BB && LiveBBs.count(UnwindBlock)) { - NeedsSpill = true; - } - } - - // If we decided we need a spill, do it. - if (NeedsSpill) { - ++NumSpilled; - DemoteRegToStack(*Inst, true); - } - } -} - -bool LowerInvoke::insertExpensiveEHSupport(Function &F) { - SmallVector<ReturnInst*,16> Returns; - SmallVector<InvokeInst*,16> Invokes; - UnreachableInst* UnreachablePlaceholder = 0; - - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) - if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) { - // Remember all return instructions in case we insert an invoke into this - // function. - Returns.push_back(RI); - } else if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) { - Invokes.push_back(II); - } - - if (Invokes.empty()) return false; - - NumInvokes += Invokes.size(); - - // TODO: This is not an optimal way to do this. In particular, this always - // inserts setjmp calls into the entries of functions with invoke instructions - // even though there are possibly paths through the function that do not - // execute any invokes. In particular, for functions with early exits, e.g. - // the 'addMove' method in hexxagon, it would be nice to not have to do the - // setjmp stuff on the early exit path. This requires a bit of dataflow, but - // would not be too hard to do. - - // If we have an invoke instruction, insert a setjmp that dominates all - // invokes. After the setjmp, use a cond branch that goes to the original - // code path on zero, and to a designated 'catch' block of nonzero. - Value *OldJmpBufPtr = 0; - if (!Invokes.empty()) { - // First thing we need to do is scan the whole function for values that are - // live across unwind edges. Each value that is live across an unwind edge - // we spill into a stack location, guaranteeing that there is nothing live - // across the unwind edge. This process also splits all critical edges - // coming out of invoke's. - splitLiveRangesLiveAcrossInvokes(Invokes); - - BasicBlock *EntryBB = F.begin(); - - // Create an alloca for the incoming jump buffer ptr and the new jump buffer - // that needs to be restored on all exits from the function. This is an - // alloca because the value needs to be live across invokes. - const TargetLowering *TLI = TM ? TM->getTargetLowering() : 0; - unsigned Align = TLI ? TLI->getJumpBufAlignment() : 0; - AllocaInst *JmpBuf = - new AllocaInst(JBLinkTy, 0, Align, - "jblink", F.begin()->begin()); - - Value *Idx[] = { Constant::getNullValue(Type::getInt32Ty(F.getContext())), - ConstantInt::get(Type::getInt32Ty(F.getContext()), 1) }; - OldJmpBufPtr = GetElementPtrInst::Create(JmpBuf, Idx, "OldBuf", - EntryBB->getTerminator()); - - // Copy the JBListHead to the alloca. 
- Value *OldBuf = new LoadInst(JBListHead, "oldjmpbufptr", true, - EntryBB->getTerminator()); - new StoreInst(OldBuf, OldJmpBufPtr, true, EntryBB->getTerminator()); - - // Add the new jumpbuf to the list. - new StoreInst(JmpBuf, JBListHead, true, EntryBB->getTerminator()); - - // Create the catch block. The catch block is basically a big switch - // statement that goes to all of the invoke catch blocks. - BasicBlock *CatchBB = - BasicBlock::Create(F.getContext(), "setjmp.catch", &F); - - // Create an alloca which keeps track of the stack pointer before every - // invoke, this allows us to properly restore the stack pointer after - // long jumping. - AllocaInst *StackPtr = new AllocaInst(Type::getInt8PtrTy(F.getContext()), 0, - "stackptr", EntryBB->begin()); - - // Create an alloca which keeps track of which invoke is currently - // executing. For normal calls it contains zero. - AllocaInst *InvokeNum = new AllocaInst(Type::getInt32Ty(F.getContext()), 0, - "invokenum",EntryBB->begin()); - new StoreInst(ConstantInt::get(Type::getInt32Ty(F.getContext()), 0), - InvokeNum, true, EntryBB->getTerminator()); - - // Insert a load in the Catch block, and a switch on its value. By default, - // we go to a block that just does an unwind (which is the correct action - // for a standard call). We insert an unreachable instruction here and - // modify the block to jump to the correct unwinding pad later. - BasicBlock *UnwindBB = BasicBlock::Create(F.getContext(), "unwindbb", &F); - UnreachablePlaceholder = new UnreachableInst(F.getContext(), UnwindBB); - - Value *CatchLoad = new LoadInst(InvokeNum, "invoke.num", true, CatchBB); - SwitchInst *CatchSwitch = - SwitchInst::Create(CatchLoad, UnwindBB, Invokes.size(), CatchBB); - - // Now that things are set up, insert the setjmp call itself. - - // Split the entry block to insert the conditional branch for the setjmp. - BasicBlock *ContBlock = EntryBB->splitBasicBlock(EntryBB->getTerminator(), - "setjmp.cont"); - - Idx[1] = ConstantInt::get(Type::getInt32Ty(F.getContext()), 0); - Value *JmpBufPtr = GetElementPtrInst::Create(JmpBuf, Idx, "TheJmpBuf", - EntryBB->getTerminator()); - JmpBufPtr = new BitCastInst(JmpBufPtr, - Type::getInt8PtrTy(F.getContext()), - "tmp", EntryBB->getTerminator()); - Value *SJRet = CallInst::Create(SetJmpFn, JmpBufPtr, "sjret", - EntryBB->getTerminator()); - - // Compare the return value to zero. - Value *IsNormal = new ICmpInst(EntryBB->getTerminator(), - ICmpInst::ICMP_EQ, SJRet, - Constant::getNullValue(SJRet->getType()), - "notunwind"); - // Nuke the uncond branch. - EntryBB->getTerminator()->eraseFromParent(); - - // Put in a new condbranch in its place. - BranchInst::Create(ContBlock, CatchBB, IsNormal, EntryBB); - - // At this point, we are all set up, rewrite each invoke instruction. - for (unsigned i = 0, e = Invokes.size(); i != e; ++i) - rewriteExpensiveInvoke(Invokes[i], i+1, InvokeNum, StackPtr, CatchSwitch); - } - - // We know that there is at least one unwind. - - // Create three new blocks, the block to load the jmpbuf ptr and compare - // against null, the block to do the longjmp, and the error block for if it - // is null. Add them at the end of the function because they are not hot. - BasicBlock *UnwindHandler = BasicBlock::Create(F.getContext(), - "dounwind", &F); - BasicBlock *UnwindBlock = BasicBlock::Create(F.getContext(), "unwind", &F); - BasicBlock *TermBlock = BasicBlock::Create(F.getContext(), "unwinderror", &F); - - // If this function contains an invoke, restore the old jumpbuf ptr. 
- Value *BufPtr; - if (OldJmpBufPtr) { - // Before the return, insert a copy from the saved value to the new value. - BufPtr = new LoadInst(OldJmpBufPtr, "oldjmpbufptr", UnwindHandler); - new StoreInst(BufPtr, JBListHead, UnwindHandler); - } else { - BufPtr = new LoadInst(JBListHead, "ehlist", UnwindHandler); - } - - // Load the JBList, if it's null, then there was no catch! - Value *NotNull = new ICmpInst(*UnwindHandler, ICmpInst::ICMP_NE, BufPtr, - Constant::getNullValue(BufPtr->getType()), - "notnull"); - BranchInst::Create(UnwindBlock, TermBlock, NotNull, UnwindHandler); - - // Create the block to do the longjmp. - // Get a pointer to the jmpbuf and longjmp. - Value *Idx[] = { Constant::getNullValue(Type::getInt32Ty(F.getContext())), - ConstantInt::get(Type::getInt32Ty(F.getContext()), 0) }; - Idx[0] = GetElementPtrInst::Create(BufPtr, Idx, "JmpBuf", UnwindBlock); - Idx[0] = new BitCastInst(Idx[0], - Type::getInt8PtrTy(F.getContext()), - "tmp", UnwindBlock); - Idx[1] = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1); - CallInst::Create(LongJmpFn, Idx, "", UnwindBlock); - new UnreachableInst(F.getContext(), UnwindBlock); - - // Set up the term block ("throw without a catch"). - new UnreachableInst(F.getContext(), TermBlock); - - // Insert a call to abort() - CallInst::Create(AbortFn, "", - TermBlock->getTerminator())->setTailCall(); - - // Replace the inserted unreachable with a branch to the unwind handler. - if (UnreachablePlaceholder) { - BranchInst::Create(UnwindHandler, UnreachablePlaceholder); - UnreachablePlaceholder->eraseFromParent(); - } - - // Finally, for any returns from this function, if this function contains an - // invoke, restore the old jmpbuf pointer to its input value. - if (OldJmpBufPtr) { - for (unsigned i = 0, e = Returns.size(); i != e; ++i) { - ReturnInst *R = Returns[i]; - - // Before the return, insert a copy from the saved value to the new value. - Value *OldBuf = new LoadInst(OldJmpBufPtr, "oldjmpbufptr", true, R); - new StoreInst(OldBuf, JBListHead, true, R); - } - } - - return true; -} - -bool LowerInvoke::runOnFunction(Function &F) { - if (useExpensiveEHSupport) - return insertExpensiveEHSupport(F); - else - return insertCheapEHSupport(F); -} diff --git a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp index 2d2a8a5..d6e5bb6 100644 --- a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp @@ -14,11 +14,13 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/ADT/STLExtras.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/CFG.h" #include "llvm/Pass.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" @@ -27,6 +29,8 @@ #include <algorithm> using namespace llvm; +#define DEBUG_TYPE "lower-switch" + namespace { /// LowerSwitch Pass - Replace all SwitchInst instructions with chained branch /// instructions. 
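Taking stock of the LowerInvoke rewrite above before the LowerSwitch hunks resume: everything tied to the old setjmp/longjmp emulation (JBLinkTy, JBListHead, the spilling of values live across unwind edges, abort on an uncaught unwind) is gone, and the pass now performs only the conversion its new file header describes. A deliberately schematic model of that conversion, with toy records rather than real InvokeInst surgery (the real code also copies the call operands, attributes, and calling convention before erasing the invoke):

    #include <string>
    #include <vector>

    struct Term {
      std::string Kind;       // "invoke", "call+br", "ret", ...
      std::string NormalDest; // invoke: normal successor
      std::string UnwindDest; // invoke: landing-pad successor
    };

    // Every invoke becomes a call followed by an unconditional branch to its
    // normal destination; the unwind edge is dropped, so landing pads become
    // dead code for a later -simplifycfg run to delete.
    static bool lowerInvokes(std::vector<Term> &Terminators) {
      bool Changed = false;
      for (Term &T : Terminators) {
        if (T.Kind != "invoke")
          continue;
        T.Kind = "call+br"; // the branch target stays T.NormalDest
        T.UnwindDest.clear();
        Changed = true;
      }
      return Changed;
    }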
@@ -37,9 +41,9 @@ namespace { initializeLowerSwitchPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnFunction(Function &F); - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { // This is a cluster of orthogonal Transforms AU.addPreserved<UnifyFunctionExitNodes>(); AU.addPreserved("mem2reg"); @@ -51,20 +55,23 @@ namespace { Constant* High; BasicBlock* BB; - CaseRange(Constant *low = 0, Constant *high = 0, BasicBlock *bb = 0) : + CaseRange(Constant *low = nullptr, Constant *high = nullptr, + BasicBlock *bb = nullptr) : Low(low), High(high), BB(bb) { } }; - typedef std::vector<CaseRange> CaseVector; + typedef std::vector<CaseRange> CaseVector; typedef std::vector<CaseRange>::iterator CaseItr; private: void processSwitchInst(SwitchInst *SI); - BasicBlock* switchConvert(CaseItr Begin, CaseItr End, Value* Val, - BasicBlock* OrigBlock, BasicBlock* Default); - BasicBlock* newLeafBlock(CaseRange& Leaf, Value* Val, - BasicBlock* OrigBlock, BasicBlock* Default); - unsigned Clusterify(CaseVector& Cases, SwitchInst *SI); + BasicBlock *switchConvert(CaseItr Begin, CaseItr End, + ConstantInt *LowerBound, ConstantInt *UpperBound, + Value *Val, BasicBlock *Predecessor, + BasicBlock *OrigBlock, BasicBlock *Default); + BasicBlock *newLeafBlock(CaseRange &Leaf, Value *Val, BasicBlock *OrigBlock, + BasicBlock *Default); + unsigned Clusterify(CaseVector &Cases, SwitchInst *SI); }; /// The comparison function for sorting the switch case values in the vector. @@ -124,17 +131,45 @@ static raw_ostream& operator<<(raw_ostream &O, return O << "]"; } +static void fixPhis(BasicBlock *Succ, + BasicBlock *OrigBlock, + BasicBlock *NewNode) { + for (BasicBlock::iterator I = Succ->begin(), + E = Succ->getFirstNonPHI(); + I != E; ++I) { + PHINode *PN = cast<PHINode>(I); + + for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) { + if (PN->getIncomingBlock(I) == OrigBlock) + PN->setIncomingBlock(I, NewNode); + } + } +} + // switchConvert - Convert the switch statement into a binary lookup of // the case values. The function recursively builds this tree. -// -BasicBlock* LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, - Value* Val, BasicBlock* OrigBlock, - BasicBlock* Default) -{ +// LowerBound and UpperBound are used to keep track of the bounds for Val +// that have already been checked by a block emitted by one of the previous +// calls to switchConvert in the call stack. +BasicBlock *LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, + ConstantInt *LowerBound, + ConstantInt *UpperBound, Value *Val, + BasicBlock *Predecessor, + BasicBlock *OrigBlock, + BasicBlock *Default) { unsigned Size = End - Begin; - if (Size == 1) + if (Size == 1) { + // Check if the Case Range is perfectly squeezed in between + // already checked Upper and Lower bounds. If it is then we can avoid + // emitting the code that checks if the value actually falls in the range + // because the bounds already tell us so. 
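The bounds check described just above appears at the start of the next hunk; when it fires, switchConvert branches straight to the case block, and fixPhis (added earlier in this diff) retargets that block's PHI nodes from the original switch block to the new comparison node. fixPhis in miniature, with (block, value) string pairs standing in for PHI incoming entries:

    #include <string>
    #include <utility>
    #include <vector>

    using ToyPhi = std::vector<std::pair<std::string, std::string>>;

    // When NewNode, not OrigBlock, becomes the predecessor of the case
    // block, every PHI entry naming OrigBlock must be rewritten to NewNode.
    static void fixPhisModel(ToyPhi &Phi, const std::string &OrigBlock,
                             const std::string &NewNode) {
      for (auto &In : Phi)
        if (In.first == OrigBlock)
          In.first = NewNode;
    }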
+ if (Begin->Low == LowerBound && Begin->High == UpperBound) { + fixPhis(Begin->BB, OrigBlock, Predecessor); + return Begin->BB; + } return newLeafBlock(*Begin, Val, OrigBlock, Default); + } unsigned Mid = Size / 2; std::vector<CaseRange> LHS(Begin, Begin + Mid); @@ -142,26 +177,65 @@ BasicBlock* LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, std::vector<CaseRange> RHS(Begin + Mid, End); DEBUG(dbgs() << "RHS: " << RHS << "\n"); - CaseRange& Pivot = *(Begin + Mid); - DEBUG(dbgs() << "Pivot ==> " - << cast<ConstantInt>(Pivot.Low)->getValue() << " -" - << cast<ConstantInt>(Pivot.High)->getValue() << "\n"); + CaseRange &Pivot = *(Begin + Mid); + DEBUG(dbgs() << "Pivot ==> " + << cast<ConstantInt>(Pivot.Low)->getValue() + << " -" << cast<ConstantInt>(Pivot.High)->getValue() << "\n"); + + // NewLowerBound here should never be the integer minimal value. + // This is because it is computed from a case range that is never + // the smallest, so there is always a case range that has at least + // a smaller value. + ConstantInt *NewLowerBound = cast<ConstantInt>(Pivot.Low); + ConstantInt *NewUpperBound; + + // If we don't have a Default block then it means that we can never + // have a value outside of a case range, so set the UpperBound to the highest + // value in the LHS part of the case ranges. + if (Default != nullptr) { + // Because NewLowerBound is never the smallest representable integer + // it is safe here to subtract one. + NewUpperBound = ConstantInt::get(NewLowerBound->getContext(), + NewLowerBound->getValue() - 1); + } else { + CaseItr LastLHS = LHS.begin() + LHS.size() - 1; + NewUpperBound = cast<ConstantInt>(LastLHS->High); + } - BasicBlock* LBranch = switchConvert(LHS.begin(), LHS.end(), Val, - OrigBlock, Default); - BasicBlock* RBranch = switchConvert(RHS.begin(), RHS.end(), Val, - OrigBlock, Default); + DEBUG(dbgs() << "LHS Bounds ==> "; + if (LowerBound) { + dbgs() << cast<ConstantInt>(LowerBound)->getSExtValue(); + } else { + dbgs() << "NONE"; + } + dbgs() << " - " << NewUpperBound->getSExtValue() << "\n"; + dbgs() << "RHS Bounds ==> "; + dbgs() << NewLowerBound->getSExtValue() << " - "; + if (UpperBound) { + dbgs() << cast<ConstantInt>(UpperBound)->getSExtValue() << "\n"; + } else { + dbgs() << "NONE\n"; + }); // Create a new node that checks if the value is < pivot. Go to the // left branch if it is and right branch if not. Function* F = OrigBlock->getParent(); BasicBlock* NewNode = BasicBlock::Create(Val->getContext(), "NodeBlock"); - Function::iterator FI = OrigBlock; - F->getBasicBlockList().insert(++FI, NewNode); ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_SLT, Val, Pivot.Low, "Pivot"); + + BasicBlock *LBranch = switchConvert(LHS.begin(), LHS.end(), LowerBound, + NewUpperBound, Val, NewNode, OrigBlock, + Default); + BasicBlock *RBranch = switchConvert(RHS.begin(), RHS.end(), NewLowerBound, + UpperBound, Val, NewNode, OrigBlock, + Default); + + Function::iterator FI = OrigBlock; + F->getBasicBlockList().insert(++FI, NewNode); NewNode->getInstList().push_back(Comp); + BranchInst::Create(LBranch, RBranch, Comp, NewNode); return NewNode; } @@ -182,7 +256,7 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val, F->getBasicBlockList().insert(++FI, NewLeaf); // Emit comparison - ICmpInst* Comp = NULL; + ICmpInst* Comp = nullptr; if (Leaf.Low == Leaf.High) { // Make the seteq instruction... 
Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_EQ, Val, @@ -245,7 +319,8 @@ unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) { // Merge case into clusters if (Cases.size()>=2) - for (CaseItr I=Cases.begin(), J=llvm::next(Cases.begin()); J!=Cases.end(); ) { + for (CaseItr I = Cases.begin(), J = std::next(Cases.begin()); + J != Cases.end();) { int64_t nextValue = cast<ConstantInt>(J->Low)->getSExtValue(); int64_t currentValue = cast<ConstantInt>(I->High)->getSExtValue(); BasicBlock* nextBB = J->BB; @@ -287,13 +362,19 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) { return; } + const bool DefaultIsUnreachable = + Default->size() == 1 && isa<UnreachableInst>(Default->getTerminator()); // Create a new, empty default block so that the new hierarchy of // if-then statements go to this and the PHI nodes are happy. - BasicBlock* NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault"); - F->getBasicBlockList().insert(Default, NewDefault); - - BranchInst::Create(Default, NewDefault); - + // If the default block is set as unreachable we avoid creating one + // because it will never be a valid target. + BasicBlock *NewDefault = nullptr; + if (!DefaultIsUnreachable) { + NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault"); + F->getBasicBlockList().insert(Default, NewDefault); + + BranchInst::Create(Default, NewDefault); + } // If there is an entry in any PHI nodes for the default edge, make sure // to update them as well. for (BasicBlock::iterator I = Default->begin(); isa<PHINode>(I); ++I) { @@ -312,12 +393,31 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) { DEBUG(dbgs() << "Cases: " << Cases << "\n"); (void)numCmps; - BasicBlock* SwitchBlock = switchConvert(Cases.begin(), Cases.end(), Val, - OrigBlock, NewDefault); + ConstantInt *UpperBound = nullptr; + ConstantInt *LowerBound = nullptr; + + // Optimize the condition where Default is an unreachable block. In this case + // we can make the bounds tightly fitted around the case value ranges, + // because we know that the value passed to the switch should always be + // exactly one of the case values. + if (DefaultIsUnreachable) { + CaseItr LastCase = Cases.begin() + Cases.size() - 1; + UpperBound = cast<ConstantInt>(LastCase->High); + LowerBound = cast<ConstantInt>(Cases.begin()->Low); + } + BasicBlock *SwitchBlock = + switchConvert(Cases.begin(), Cases.end(), LowerBound, UpperBound, Val, + OrigBlock, OrigBlock, NewDefault); // Branch to our shiny new if-then stuff... BranchInst::Create(SwitchBlock, OrigBlock); // We are now done with the switch instruction, delete it.
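The Clusterify changes above are cosmetic; its behavior is that sorted, adjacent case values sharing a successor are merged into [Low, High] ranges. A minimal standalone sketch of the merging rule, with invented names and ignoring the same-successor requirement the real pass also enforces:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    struct Range { int64_t Low, High; };

    std::vector<Range> clusterify(std::vector<int64_t> Vals) {
      std::sort(Vals.begin(), Vals.end());
      std::vector<Range> Out;
      for (int64_t V : Vals) {
        if (!Out.empty() && Out.back().High + 1 == V)
          Out.back().High = V;   // contiguous with the open range: extend it
        else
          Out.push_back({V, V}); // gap: start a new range
      }
      return Out;                // {0,1,2,5,6,10} -> [0,2] [5,6] [10,10]
    }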
CurBlock->getInstList().erase(SI); + + pred_iterator PI = pred_begin(Default), E = pred_end(Default); + // If the Default block has no more predecessors just remove it + if (PI == E) { + DeleteDeadBlock(Default); + } } diff --git a/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp b/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp index 61b3965..189caa7 100644 --- a/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp +++ b/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp @@ -12,16 +12,17 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mem2reg" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Dominators.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" using namespace llvm; +#define DEBUG_TYPE "mem2reg" + STATISTIC(NumPromoted, "Number of alloca's promoted"); namespace { @@ -34,10 +35,10 @@ namespace { // runOnFunction - To run this pass, first we calculate the alloca // instructions that are safe for promotion, then we promote each one. // - virtual bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<DominatorTree>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); // This is a cluster of orthogonal Transforms AU.addPreserved<UnifyFunctionExitNodes>(); @@ -50,7 +51,7 @@ namespace { char PromotePass::ID = 0; INITIALIZE_PASS_BEGIN(PromotePass, "mem2reg", "Promote Memory to Register", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(PromotePass, "mem2reg", "Promote Memory to Register", false, false) @@ -61,7 +62,7 @@ bool PromotePass::runOnFunction(Function &F) { bool Changed = false; - DominatorTree &DT = getAnalysis<DominatorTree>(); + DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); while (1) { Allocas.clear(); diff --git a/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp b/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp index c3704531..395a46b 100644 --- a/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp +++ b/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp @@ -48,11 +48,11 @@ namespace { initializeMetaRenamerPass(*PassRegistry::getPassRegistry()); } - void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); } - bool runOnModule(Module &M) { + bool runOnModule(Module &M) override { static const char *const metaNames[] = { // See http://en.wikipedia.org/wiki/Metasyntactic_variable "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge", diff --git a/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp index ff6e6f9..d9dbbca 100644 --- a/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp +++ b/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp @@ -24,16 +24,16 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F, int Priority) { IRBuilder<> IRB(M.getContext()); FunctionType *FnTy = FunctionType::get(IRB.getVoidTy(), false); - StructType *Ty = StructType::get( - IRB.getInt32Ty(), PointerType::getUnqual(FnTy), NULL); - - Constant *RuntimeCtorInit = ConstantStruct::get( - Ty, IRB.getInt32(Priority), F, 
NULL); // Get the current set of static global constructors and add the new ctor // to the list. SmallVector<Constant *, 16> CurrentCtors; - if (GlobalVariable * GVCtor = M.getNamedGlobal(Array)) { + StructType *EltTy; + if (GlobalVariable *GVCtor = M.getNamedGlobal(Array)) { + // If there is a global_ctors array, use the existing struct type, which can + // have 2 or 3 fields. + ArrayType *ATy = cast<ArrayType>(GVCtor->getType()->getElementType()); + EltTy = cast<StructType>(ATy->getElementType()); if (Constant *Init = GVCtor->getInitializer()) { unsigned n = Init->getNumOperands(); CurrentCtors.reserve(n + 1); @@ -41,13 +41,26 @@ static void appendToGlobalArray(const char *Array, CurrentCtors.push_back(cast<Constant>(Init->getOperand(i))); } GVCtor->eraseFromParent(); + } else { + // Use a simple two-field struct if there isn't one already. + EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy), + nullptr); } + // Build a 2 or 3 field global_ctor entry. We don't take a comdat key. + Constant *CSVals[3]; + CSVals[0] = IRB.getInt32(Priority); + CSVals[1] = F; + // FIXME: Drop support for the two element form in LLVM 4.0. + if (EltTy->getNumElements() >= 3) + CSVals[2] = llvm::Constant::getNullValue(IRB.getInt8PtrTy()); + Constant *RuntimeCtorInit = + ConstantStruct::get(EltTy, makeArrayRef(CSVals, EltTy->getNumElements())); + CurrentCtors.push_back(RuntimeCtorInit); // Create a new initializer. - ArrayType *AT = ArrayType::get(RuntimeCtorInit->getType(), - CurrentCtors.size()); + ArrayType *AT = ArrayType::get(EltTy, CurrentCtors.size()); Constant *NewInit = ConstantArray::get(AT, CurrentCtors); // Create the new global variable and replace all uses of diff --git a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 8f6eee3..06d73fe 100644 --- a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -25,7 +25,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mem2reg" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -34,23 +33,25 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/DIBuilder.h" -#include "llvm/DebugInfo.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DIBuilder.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Metadata.h" -#include "llvm/Support/CFG.h" #include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <queue> using namespace llvm; +#define DEBUG_TYPE "mem2reg" + STATISTIC(NumLocalPromoted, "Number of alloca's promoted within one block"); STATISTIC(NumSingleStore, "Number of alloca's promoted with a single store"); STATISTIC(NumDeadAlloca, "Number of dead alloca's removed"); @@ -59,11 +60,10 @@ STATISTIC(NumPHIInsert, "Number of PHI nodes inserted"); bool llvm::isAllocaPromotable(const AllocaInst *AI) { // FIXME: If the memory unit is of pointer or integer type, we can permit // assignments to subsections of the memory unit. 
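Source-level intuition for isAllocaPromotable, which begins above: a local whose address is used only by direct, non-volatile loads and stores (plus lifetime markers and their i8* casts) can be promoted to a register, while one whose address escapes cannot. The escape() helper below is invented for the sketch:

    extern void escape(int *);

    int promotable() {
      int X = 1;      // address used only by direct loads/stores: promotable
      X += 2;
      return X;
    }

    int not_promotable() {
      int X = 1;
      escape(&X);     // address escapes; mem2reg must leave the alloca alone
      return X;
    }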
+ unsigned AS = AI->getType()->getAddressSpace(); // Only allow direct and non-volatile loads and stores... - for (Value::const_use_iterator UI = AI->use_begin(), UE = AI->use_end(); - UI != UE; ++UI) { // Loop over all of the uses of the alloca - const User *U = *UI; + for (const User *U : AI->users()) { if (const LoadInst *LI = dyn_cast<LoadInst>(U)) { // Note that atomic loads can be transformed; atomic semantics do // not have any meaning for a local alloca. @@ -81,12 +81,12 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) { II->getIntrinsicID() != Intrinsic::lifetime_end) return false; } else if (const BitCastInst *BCI = dyn_cast<BitCastInst>(U)) { - if (BCI->getType() != Type::getInt8PtrTy(U->getContext())) + if (BCI->getType() != Type::getInt8PtrTy(U->getContext(), AS)) return false; if (!onlyUsedByLifetimeMarkers(BCI)) return false; } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) { - if (GEPI->getType() != Type::getInt8PtrTy(U->getContext())) + if (GEPI->getType() != Type::getInt8PtrTy(U->getContext(), AS)) return false; if (!GEPI->hasAllZeroIndices()) return false; @@ -116,11 +116,11 @@ struct AllocaInfo { void clear() { DefiningBlocks.clear(); UsingBlocks.clear(); - OnlyStore = 0; - OnlyBlock = 0; + OnlyStore = nullptr; + OnlyBlock = nullptr; OnlyUsedInOneBlock = true; - AllocaPointerVal = 0; - DbgDeclare = 0; + AllocaPointerVal = nullptr; + DbgDeclare = nullptr; } /// Scan the uses of the specified alloca, filling in the AllocaInfo used @@ -131,8 +131,7 @@ struct AllocaInfo { // As we scan the uses of the alloca instruction, keep track of stores, // and decide whether all of the loads and stores to the alloca are within // the same basic block. - for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); - UI != E;) { + for (auto UI = AI->user_begin(), E = AI->user_end(); UI != E;) { Instruction *User = cast<Instruction>(*UI++); if (StoreInst *SI = dyn_cast<StoreInst>(User)) { @@ -149,7 +148,7 @@ struct AllocaInfo { } if (OnlyUsedInOneBlock) { - if (OnlyBlock == 0) + if (!OnlyBlock) OnlyBlock = User->getParent(); else if (OnlyBlock != User->getParent()) OnlyUsedInOneBlock = false; @@ -165,7 +164,7 @@ class RenamePassData { public: typedef std::vector<Value *> ValVector; - RenamePassData() : BB(NULL), Pred(NULL), Values() {} + RenamePassData() : BB(nullptr), Pred(nullptr), Values() {} RenamePassData(BasicBlock *B, BasicBlock *P, const ValVector &V) : BB(B), Pred(P), Values(V) {} BasicBlock *BB; @@ -317,8 +316,7 @@ static void removeLifetimeIntrinsicUsers(AllocaInst *AI) { // Knowing that this alloca is promotable, we know that it's safe to kill all // instructions except for load and store. - for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end(); - UI != UE;) { + for (auto UI = AI->user_begin(), UE = AI->user_end(); UI != UE;) { Instruction *I = cast<Instruction>(*UI); ++UI; if (isa<LoadInst>(I) || isa<StoreInst>(I)) @@ -328,10 +326,9 @@ static void removeLifetimeIntrinsicUsers(AllocaInst *AI) { // The only users of this bitcast/GEP instruction are lifetime intrinsics. // Follow the use/def chain to erase them now instead of leaving it for // dead code elimination later. 
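The loop being replaced next, and its rewritten form, both rely on an advance-before-erase idiom: the use iterator is stepped past the current user before that user is erased, so the iterator never dangles. A generic sketch of the idiom (eraseAllUsers is my name; it assumes each user is an instruction that uses V exactly once and is safe to delete):

    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/Value.h"

    static void eraseAllUsers(llvm::Value *V) {
      for (auto UI = V->user_begin(), UE = V->user_end(); UI != UE;) {
        llvm::Instruction *I = llvm::cast<llvm::Instruction>(*UI++); // step first
        I->eraseFromParent(); // safe: UI already points past this use
      }
    }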
- for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); - UI != UE;) { - Instruction *Inst = cast<Instruction>(*UI); - ++UI; + for (auto UUI = I->user_begin(), UUE = I->user_end(); UUI != UUE;) { + Instruction *Inst = cast<Instruction>(*UUI); + ++UUI; Inst->eraseFromParent(); } } @@ -359,7 +356,7 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, // Clear out UsingBlocks. We will reconstruct it here if needed. Info.UsingBlocks.clear(); - for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E;) { + for (auto UI = AI->user_begin(), E = AI->user_end(); UI != E;) { Instruction *UserInst = cast<Instruction>(*UI++); if (!isa<LoadInst>(UserInst)) { assert(UserInst == OnlyStore && "Should only have load/stores"); @@ -456,9 +453,8 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, typedef SmallVector<std::pair<unsigned, StoreInst *>, 64> StoresByIndexTy; StoresByIndexTy StoresByIndex; - for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E; - ++UI) - if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) + for (User *U : AI->users()) + if (StoreInst *SI = dyn_cast<StoreInst>(U)) StoresByIndex.push_back(std::make_pair(LBI.getInstructionIndex(SI), SI)); // Sort the stores by their index, making it efficient to do a lookup with a @@ -467,7 +463,7 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, // Walk all of the loads from this alloca, replacing them with the nearest // store above them, if any. - for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E;) { + for (auto UI = AI->user_begin(), E = AI->user_end(); UI != E;) { LoadInst *LI = dyn_cast<LoadInst>(*UI++); if (!LI) continue; @@ -477,7 +473,8 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, // Find the nearest store that has a lower index than this load. StoresByIndexTy::iterator I = std::lower_bound(StoresByIndex.begin(), StoresByIndex.end(), - std::make_pair(LoadIdx, static_cast<StoreInst *>(0)), + std::make_pair(LoadIdx, + static_cast<StoreInst *>(nullptr)), less_first()); if (I == StoresByIndex.begin()) @@ -485,7 +482,7 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, LI->replaceAllUsesWith(UndefValue::get(LI->getType())); else // Otherwise, there was a store before this load, the load takes its value. - LI->replaceAllUsesWith(llvm::prior(I)->second->getOperand(0)); + LI->replaceAllUsesWith(std::prev(I)->second->getOperand(0)); if (AST && LI->getType()->isPointerTy()) AST->deleteValue(LI); @@ -495,7 +492,7 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, // Remove the (now dead) stores and alloca. while (!AI->use_empty()) { - StoreInst *SI = cast<StoreInst>(AI->use_back()); + StoreInst *SI = cast<StoreInst>(AI->user_back()); // Record debuginfo for the store before removing it. if (DbgDeclareInst *DDI = Info.DbgDeclare) { DIBuilder DIB(*AI->getParent()->getParent()->getParent()); @@ -638,7 +635,7 @@ void PromoteMem2Reg::run() { // and inserting the phi nodes we marked as necessary // std::vector<RenamePassData> RenamePassWorkList; - RenamePassWorkList.push_back(RenamePassData(F.begin(), 0, Values)); + RenamePassWorkList.push_back(RenamePassData(F.begin(), nullptr, Values)); do { RenamePassData RPD; RPD.swap(RenamePassWorkList.back()); @@ -679,8 +676,8 @@ void PromoteMem2Reg::run() { // Iterating over NewPhiNodes is deterministic, so it is safe to try to // simplify and RAUW them as we go. 
If it was not, we could add uses to - // the values we replace with in a non deterministic order, thus creating - // non deterministic def->use chains. + // the values we replace with in a non-deterministic order, thus creating + // non-deterministic def->use chains. for (DenseMap<std::pair<unsigned, unsigned>, PHINode *>::iterator I = NewPhiNodes.begin(), E = NewPhiNodes.end(); @@ -688,7 +685,7 @@ void PromoteMem2Reg::run() { PHINode *PN = I->second; // If this PHI node merges one value and/or undefs, get the value. - if (Value *V = SimplifyInstruction(PN, 0, 0, &DT)) { + if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, &DT)) { if (AST && PN->getType()->isPointerTy()) AST->deleteValue(PN); PN->replaceAllUsesWith(V); @@ -996,7 +993,7 @@ NextIteration: // Get the next phi node. ++PNI; APN = dyn_cast<PHINode>(PNI); - if (APN == 0) + if (!APN) break; // Verify that it is missing entries. If not, it is not being inserted diff --git a/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp index 30adbfa..3fcb789 100644 --- a/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp @@ -11,17 +11,14 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "ssaupdater" #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/TinyPtrVector.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/Support/AlignOf.h" -#include "llvm/Support/Allocator.h" -#include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -30,20 +27,22 @@ using namespace llvm; +#define DEBUG_TYPE "ssaupdater" + typedef DenseMap<BasicBlock*, Value*> AvailableValsTy; static AvailableValsTy &getAvailableVals(void *AV) { return *static_cast<AvailableValsTy*>(AV); } SSAUpdater::SSAUpdater(SmallVectorImpl<PHINode*> *NewPHI) - : AV(0), ProtoType(0), ProtoName(), InsertedPHIs(NewPHI) {} + : AV(nullptr), ProtoType(nullptr), ProtoName(), InsertedPHIs(NewPHI) {} SSAUpdater::~SSAUpdater() { delete static_cast<AvailableValsTy*>(AV); } void SSAUpdater::Initialize(Type *Ty, StringRef Name) { - if (AV == 0) + if (!AV) AV = new AvailableValsTy(); else getAvailableVals(AV).clear(); @@ -56,7 +55,7 @@ bool SSAUpdater::HasValueForBlock(BasicBlock *BB) const { } void SSAUpdater::AddAvailableValue(BasicBlock *BB, Value *V) { - assert(ProtoType != 0 && "Need to initialize SSAUpdater"); + assert(ProtoType && "Need to initialize SSAUpdater"); assert(ProtoType == V->getType() && "All rewritten values must have the same type"); getAvailableVals(AV)[BB] = V; @@ -92,7 +91,7 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { // Otherwise, we have the hard case. Get the live-in values for each // predecessor. SmallVector<std::pair<BasicBlock*, Value*>, 8> PredValues; - Value *SingularValue = 0; + Value *SingularValue = nullptr; // We can get our predecessor info by walking the pred_iterator list, but it // is relatively slow. 
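Since the SSAUpdater diff begins here, a minimal usage sketch of the public API touched by these hunks; the function and parameter names below are placeholders of mine:

    #include "llvm/Transforms/Utils/SSAUpdater.h"

    llvm::Value *merge(llvm::Type *Ty, llvm::BasicBlock *BB1, llvm::Value *V1,
                       llvm::BasicBlock *BB2, llvm::Value *V2,
                       llvm::BasicBlock *MergeBB) {
      llvm::SSAUpdater SSA;
      SSA.Initialize(Ty, "merged");
      SSA.AddAvailableValue(BB1, V1); // V1 live out of BB1; must have type Ty
      SSA.AddAvailableValue(BB2, V2); // V2 live out of BB2; must have type Ty
      // Returns the singular value when all predecessors agree, undef when
      // none supplies a value, and otherwise inserts a PHI in MergeBB:
      return SSA.GetValueInMiddleOfBlock(MergeBB);
    }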
If we already have PHI nodes in this block, walk one @@ -107,7 +106,7 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { if (i == 0) SingularValue = PredVal; else if (PredVal != SingularValue) - SingularValue = 0; + SingularValue = nullptr; } } else { bool isFirstPred = true; @@ -121,7 +120,7 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { SingularValue = PredVal; isFirstPred = false; } else if (PredVal != SingularValue) - SingularValue = 0; + SingularValue = nullptr; } } @@ -130,7 +129,7 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { return UndefValue::get(ProtoType); // Otherwise, if all the merged values are the same, just use it. - if (SingularValue != 0) + if (SingularValue) return SingularValue; // Otherwise, we do need a PHI: check to see if we already have one available @@ -293,7 +292,7 @@ public: PHINode *PHI = ValueIsPHI(Val, Updater); if (PHI && PHI->getNumIncomingValues() == 0) return PHI; - return 0; + return nullptr; } /// GetPHIValue - For the specified PHI instruction, return the value @@ -403,7 +402,7 @@ run(const SmallVectorImpl<Instruction*> &Insts) const { // the order of these instructions in the block. If the first use in the // block is a load, then it uses the live in value. The last store defines // the live out value. We handle this by doing a linear scan of the block. - Value *StoredValue = 0; + Value *StoredValue = nullptr; for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ++II) { if (LoadInst *L = dyn_cast<LoadInst>(II)) { // If this is a load from an unrelated pointer, ignore it. diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index ff50b12..24bb63b 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "simplifycfg" #include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" @@ -23,6 +22,8 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" @@ -34,14 +35,12 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/NoFolder.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" -#include "llvm/Support/CFG.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/ConstantRange.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/NoFolder.h" -#include "llvm/Support/PatternMatch.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include <algorithm> @@ -50,6 +49,8 @@ using namespace llvm; using namespace PatternMatch; +#define DEBUG_TYPE "simplifycfg" + static cl::opt<unsigned> PHINodeFoldingThreshold("phi-node-folding-threshold", cl::Hidden, cl::init(1), cl::desc("Control the amount of phi node folding to perform (default = 1)")); @@ -62,12 +63,13 @@ static cl::opt<bool> SinkCommon("simplifycfg-sink-common", cl::Hidden, cl::init(true), cl::desc("Sink common instructions down to the end block")); -static cl::opt<bool> -HoistCondStores("simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true), - cl::desc("Hoist conditional 
stores if an unconditional store preceeds")); +static cl::opt<bool> HoistCondStores( + "simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true), + cl::desc("Hoist conditional stores if an unconditional store precedes")); STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables"); +STATISTIC(NumLookupTablesHoles, "Number of switch instructions turned into lookup tables (holes checked)"); STATISTIC(NumSinkCommons, "Number of common instructions sunk down to the end block"); STATISTIC(NumSpeculations, "Number of speculative executed instructions"); @@ -90,7 +92,7 @@ namespace { class SimplifyCFGOpt { const TargetTransformInfo &TTI; - const DataLayout *const TD; + const DataLayout *const DL; Value *isValueEqualityComparison(TerminatorInst *TI); BasicBlock *GetValueEqualityComparisonCases(TerminatorInst *TI, std::vector<ValueEqualityComparisonCase> &Cases); @@ -109,8 +111,8 @@ class SimplifyCFGOpt { bool SimplifyCondBranch(BranchInst *BI, IRBuilder <>&Builder); public: - SimplifyCFGOpt(const TargetTransformInfo &TTI, const DataLayout *TD) - : TTI(TTI), TD(TD) {} + SimplifyCFGOpt(const TargetTransformInfo &TTI, const DataLayout *DL) + : TTI(TTI), DL(DL) {} bool run(BasicBlock *BB); }; } @@ -199,8 +201,8 @@ static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred, /// ComputeSpeculationCost - Compute an abstract "cost" of speculating the /// given instruction, which is assumed to be safe to speculate. 1 means /// cheap, 2 means less cheap, and UINT_MAX means prohibitively expensive. -static unsigned ComputeSpeculationCost(const User *I) { - assert(isSafeToSpeculativelyExecute(I) && +static unsigned ComputeSpeculationCost(const User *I, const DataLayout *DL) { + assert(isSafeToSpeculativelyExecute(I, DL) && "Instruction is not safe to speculatively execute!"); switch (Operator::getOpcode(I)) { default: @@ -211,6 +213,7 @@ static unsigned ComputeSpeculationCost(const User *I) { if (!cast<GEPOperator>(I)->hasAllConstantIndices()) return UINT_MAX; return 1; + case Instruction::ExtractValue: case Instruction::Load: case Instruction::Add: case Instruction::Sub: @@ -224,6 +227,9 @@ static unsigned ComputeSpeculationCost(const User *I) { case Instruction::Trunc: case Instruction::ZExt: case Instruction::SExt: + case Instruction::BitCast: + case Instruction::ExtractElement: + case Instruction::InsertElement: return 1; // These are all cheap. case Instruction::Call: @@ -251,7 +257,8 @@ static unsigned ComputeSpeculationCost(const User *I) { /// CostRemaining, false is returned and CostRemaining is undefined. static bool DominatesMergePoint(Value *V, BasicBlock *BB, SmallPtrSet<Instruction*, 4> *AggressiveInsts, - unsigned &CostRemaining) { + unsigned &CostRemaining, + const DataLayout *DL) { Instruction *I = dyn_cast<Instruction>(V); if (!I) { // Non-instructions all dominate instructions, but not all constantexprs @@ -271,12 +278,12 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, // branch to BB, then it must be in the 'conditional' part of the "if // statement". If not, it definitely dominates the region. BranchInst *BI = dyn_cast<BranchInst>(PBB->getTerminator()); - if (BI == 0 || BI->isConditional() || BI->getSuccessor(0) != BB) + if (!BI || BI->isConditional() || BI->getSuccessor(0) != BB) return true; // If we aren't allowing aggressive promotion anymore, then don't consider // instructions in the 'if region'. 
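DominatesMergePoint, continued below, is the gate for if-conversion; at the source level, the transformation it prices out looks like this (illustrative functions; the hoisted instruction's cost is charged against PHINodeFoldingThreshold):

    int pick(int a, int b, bool c) {
      int t;
      if (c) t = a + b;  // one cheap, speculatable instruction
      else   t = a;
      return t;
    }

    // After the fold the diamond is flat and the PHI becomes a select;
    // the hoisted add now executes unconditionally:
    int pick_flat(int a, int b, bool c) {
      int sum = a + b;
      return c ? sum : a;
    }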
- if (AggressiveInsts == 0) return false; + if (!AggressiveInsts) return false; // If we have seen this instruction before, don't count it again. if (AggressiveInsts->count(I)) return true; @@ -284,10 +291,10 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, // Okay, it looks like the instruction IS in the "condition". Check to // see if it's a cheap instruction to unconditionally compute, and if it // only uses stuff defined outside of the condition. If so, hoist it out. - if (!isSafeToSpeculativelyExecute(I)) + if (!isSafeToSpeculativelyExecute(I, DL)) return false; - unsigned Cost = ComputeSpeculationCost(I); + unsigned Cost = ComputeSpeculationCost(I, DL); if (Cost > CostRemaining) return false; @@ -297,7 +304,7 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, // Okay, we can only really hoist these out if their operands do // not take us over the cost threshold. for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) - if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining)) + if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining, DL)) return false; // Okay, it's safe to do this! Remember this instruction. AggressiveInsts->insert(I); @@ -306,15 +313,15 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, /// GetConstantInt - Extract ConstantInt from value, looking through IntToPtr /// and PointerNullValue. Return NULL if value is not a constant int. -static ConstantInt *GetConstantInt(Value *V, const DataLayout *TD) { +static ConstantInt *GetConstantInt(Value *V, const DataLayout *DL) { // Normal constant int. ConstantInt *CI = dyn_cast<ConstantInt>(V); - if (CI || !TD || !isa<Constant>(V) || !V->getType()->isPointerTy()) + if (CI || !DL || !isa<Constant>(V) || !V->getType()->isPointerTy()) return CI; // This is some kind of pointer constant. Turn it into a pointer-sized // ConstantInt if possible. - IntegerType *PtrTy = cast<IntegerType>(TD->getIntPtrType(V->getType())); + IntegerType *PtrTy = cast<IntegerType>(DL->getIntPtrType(V->getType())); // Null pointer means 0, see SelectionDAGBuilder::getValue(const Value*). if (isa<ConstantPointerNull>(V)) @@ -331,7 +338,7 @@ static ConstantInt *GetConstantInt(Value *V, const DataLayout *TD) { return cast<ConstantInt> (ConstantExpr::getIntegerCast(CI, PtrTy, /*isSigned=*/false)); } - return 0; + return nullptr; } /// GatherConstantCompares - Given a potentially 'or'd or 'and'd together @@ -340,13 +347,13 @@ static ConstantInt *GetConstantInt(Value *V, const DataLayout *TD) { /// Values vector. static Value * GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra, - const DataLayout *TD, bool isEQ, unsigned &UsedICmps) { + const DataLayout *DL, bool isEQ, unsigned &UsedICmps) { Instruction *I = dyn_cast<Instruction>(V); - if (I == 0) return 0; + if (!I) return nullptr; // If this is an icmp against a constant, handle this as one of the cases. if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) { - if (ConstantInt *C = GetConstantInt(I->getOperand(1), TD)) { + if (ConstantInt *C = GetConstantInt(I->getOperand(1), DL)) { Value *RHSVal; ConstantInt *RHSC; @@ -389,27 +396,27 @@ GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra, // If there are a ton of values, we don't want to make a ginormous switch. if (Span.getSetSize().ugt(8) || Span.isEmptySet()) - return 0; + return nullptr; for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp) Vals.push_back(ConstantInt::get(V->getContext(), Tmp)); UsedICmps++; return hasAdd ? 
RHSVal : I->getOperand(0); } - return 0; + return nullptr; } // Otherwise, we can only handle an | or &, depending on isEQ. if (I->getOpcode() != (isEQ ? Instruction::Or : Instruction::And)) - return 0; + return nullptr; unsigned NumValsBeforeLHS = Vals.size(); unsigned UsedICmpsBeforeLHS = UsedICmps; - if (Value *LHS = GatherConstantCompares(I->getOperand(0), Vals, Extra, TD, + if (Value *LHS = GatherConstantCompares(I->getOperand(0), Vals, Extra, DL, isEQ, UsedICmps)) { unsigned NumVals = Vals.size(); unsigned UsedICmpsBeforeRHS = UsedICmps; - if (Value *RHS = GatherConstantCompares(I->getOperand(1), Vals, Extra, TD, + if (Value *RHS = GatherConstantCompares(I->getOperand(1), Vals, Extra, DL, isEQ, UsedICmps)) { if (LHS == RHS) return LHS; @@ -419,33 +426,33 @@ GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra, // The RHS of the or/and can't be folded in and we haven't used "Extra" yet, // set it and return success. - if (Extra == 0 || Extra == I->getOperand(1)) { + if (Extra == nullptr || Extra == I->getOperand(1)) { Extra = I->getOperand(1); return LHS; } Vals.resize(NumValsBeforeLHS); UsedICmps = UsedICmpsBeforeLHS; - return 0; + return nullptr; } // If the LHS can't be folded in, but Extra is available and RHS can, try to // use LHS as Extra. - if (Extra == 0 || Extra == I->getOperand(0)) { + if (Extra == nullptr || Extra == I->getOperand(0)) { Value *OldExtra = Extra; Extra = I->getOperand(0); - if (Value *RHS = GatherConstantCompares(I->getOperand(1), Vals, Extra, TD, + if (Value *RHS = GatherConstantCompares(I->getOperand(1), Vals, Extra, DL, isEQ, UsedICmps)) return RHS; assert(Vals.size() == NumValsBeforeLHS); Extra = OldExtra; } - return 0; + return nullptr; } static void EraseTerminatorInstAndDCECond(TerminatorInst *TI) { - Instruction *Cond = 0; + Instruction *Cond = nullptr; if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { Cond = dyn_cast<Instruction>(SI->getCondition()); } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { @@ -462,7 +469,7 @@ static void EraseTerminatorInstAndDCECond(TerminatorInst *TI) { /// isValueEqualityComparison - Return true if the specified terminator checks /// to see if a value is equal to constant integer value. Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) { - Value *CV = 0; + Value *CV = nullptr; if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { // Do not permit merging of large switch instructions into their // predecessors unless there is only one predecessor. @@ -472,14 +479,14 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) { } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) if (BI->isConditional() && BI->getCondition()->hasOneUse()) if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition())) - if (ICI->isEquality() && GetConstantInt(ICI->getOperand(1), TD)) + if (ICI->isEquality() && GetConstantInt(ICI->getOperand(1), DL)) CV = ICI->getOperand(0); // Unwrap any lossless ptrtoint cast. 
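The GatherConstantCompares recursion above walks an or/and tree of equality compares against constants. The source-level pattern and its payoff, with invented function names (the switch form is what SimplifyBranchOnICmpChain, further below, rebuilds):

    bool is_special(int x) {
      return x == 1 || x == 4 || x == 9; // collected as Vals = {1, 4, 9}
    }

    bool is_special_as_switch(int x) {   // the equivalent dispatch
      switch (x) {
      case 1: case 4: case 9: return true;
      default:                return false;
      }
    }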
- if (TD && CV) { + if (DL && CV) { if (PtrToIntInst *PTII = dyn_cast<PtrToIntInst>(CV)) { Value *Ptr = PTII->getPointerOperand(); - if (PTII->getType() == TD->getIntPtrType(Ptr->getType())) + if (PTII->getType() == DL->getIntPtrType(Ptr->getType())) CV = Ptr; } } @@ -504,7 +511,7 @@ GetValueEqualityComparisonCases(TerminatorInst *TI, ICmpInst *ICI = cast<ICmpInst>(BI->getCondition()); BasicBlock *Succ = BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_NE); Cases.push_back(ValueEqualityComparisonCase(GetConstantInt(ICI->getOperand(1), - TD), + DL), Succ)); return BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_EQ); } @@ -652,11 +659,11 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, // Otherwise, TI's block must correspond to some matched value. Find out // which value (or set of values) this is. - ConstantInt *TIV = 0; + ConstantInt *TIV = nullptr; BasicBlock *TIBB = TI->getParent(); for (unsigned i = 0, e = PredCases.size(); i != e; ++i) if (PredCases[i].Dest == TIBB) { - if (TIV != 0) + if (TIV) return false; // Cannot handle multiple values coming to this block. TIV = PredCases[i].Value; } @@ -664,7 +671,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, // Okay, we found the one constant that our value can be if we get into TI's // BB. Find out which successor will unconditionally be branched to. - BasicBlock *TheRealDest = 0; + BasicBlock *TheRealDest = nullptr; for (unsigned i = 0, e = ThisCases.size(); i != e; ++i) if (ThisCases[i].Value == TIV) { TheRealDest = ThisCases[i].Dest; @@ -672,7 +679,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, } // If not handled by any explicit cases, it is handled by the default case. - if (TheRealDest == 0) TheRealDest = ThisDef; + if (!TheRealDest) TheRealDest = ThisDef; // Remove PHI node entries for dead edges. BasicBlock *CheckEdge = TheRealDest; @@ -680,7 +687,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, if (*SI != CheckEdge) (*SI)->removePredecessor(TIBB); else - CheckEdge = 0; + CheckEdge = nullptr; // Insert the new branch. Instruction *NI = Builder.CreateBr(TheRealDest); @@ -732,8 +739,7 @@ static void GetBranchWeights(TerminatorInst *TI, MDNode* MD = TI->getMetadata(LLVMContext::MD_prof); assert(MD); for (unsigned i = 1, e = MD->getNumOperands(); i < e; ++i) { - ConstantInt* CI = dyn_cast<ConstantInt>(MD->getOperand(i)); - assert(CI); + ConstantInt *CI = cast<ConstantInt>(MD->getOperand(i)); Weights.push_back(CI->getValue().getZExtValue()); } @@ -748,21 +754,14 @@ static void GetBranchWeights(TerminatorInst *TI, } } -/// Sees if any of the weights are too big for a uint32_t, and halves all the -/// weights if any are. +/// Keep halving the weights until all can fit in uint32_t. static void FitWeights(MutableArrayRef<uint64_t> Weights) { - bool Halve = false; - for (unsigned i = 0; i < Weights.size(); ++i) - if (Weights[i] > UINT_MAX) { - Halve = true; - break; - } - - if (! Halve) - return; - - for (unsigned i = 0; i < Weights.size(); ++i) - Weights[i] /= 2; + uint64_t Max = *std::max_element(Weights.begin(), Weights.end()); + if (Max > UINT_MAX) { + unsigned Offset = 32 - countLeadingZeros(Max); + for (uint64_t &I : Weights) + I >>= Offset; + } } /// FoldValueComparisonIntoPredecessors - The specified terminator is a value @@ -929,8 +928,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, Builder.SetInsertPoint(PTI); // Convert pointer to int before we switch. 
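A standalone mirror of the new FitWeights arithmetic above, under the assumption that Weights is non-empty and using a GCC/Clang builtin in place of llvm::countLeadingZeros: for a maximum weight with B significant bits (B > 32), 32 - countLeadingZeros(Max) equals B - 32, so the shift makes the largest weight fit in 32 bits while preserving the relative ratios up to rounding.

    #include <algorithm>
    #include <climits>
    #include <cstdint>
    #include <vector>

    static void fitWeights(std::vector<uint64_t> &Weights) {
      uint64_t Max = *std::max_element(Weights.begin(), Weights.end());
      if (Max > UINT_MAX) {
        // Significant bits of Max, minus 32; e.g. a 41-bit Max gives 9.
        unsigned Offset = (64 - __builtin_clzll(Max)) - 32;
        for (uint64_t &W : Weights)
          W >>= Offset;
      }
    }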
if (CV->getType()->isPointerTy()) { - assert(TD && "Cannot switch on pointer without DataLayout"); - CV = Builder.CreatePtrToInt(CV, TD->getIntPtrType(CV->getType()), + assert(DL && "Cannot switch on pointer without DataLayout"); + CV = Builder.CreatePtrToInt(CV, DL->getIntPtrType(CV->getType()), "magicptr"); } @@ -957,10 +956,10 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // Okay, last check. If BB is still a successor of PSI, then we must // have an infinite loop case. If so, add an infinitely looping block // to handle the case to preserve the behavior of the code. - BasicBlock *InfLoopBlock = 0; + BasicBlock *InfLoopBlock = nullptr; for (unsigned i = 0, e = NewSI->getNumSuccessors(); i != e; ++i) if (NewSI->getSuccessor(i) == BB) { - if (InfLoopBlock == 0) { + if (!InfLoopBlock) { // Insert it at the end of the function, because it's either code, // or it won't matter if it's hot. :) InfLoopBlock = BasicBlock::Create(BB->getContext(), @@ -999,7 +998,7 @@ static bool isSafeToHoistInvoke(BasicBlock *BB1, BasicBlock *BB2, /// HoistThenElseCodeToIf - Given a conditional branch that goes to BB1 and /// BB2, hoist any common code in the two blocks up into the branch block. The /// caller of this function guarantees that BI's block dominates BB1 and BB2. -static bool HoistThenElseCodeToIf(BranchInst *BI) { +static bool HoistThenElseCodeToIf(BranchInst *BI, const DataLayout *DL) { // This does very trivial matching, with limited scanning, to find identical // instructions in the two blocks. In particular, we don't want to get into // O(M*N) situations here where M and N are the sizes of BB1 and BB2. As @@ -1073,9 +1072,9 @@ HoistTerminator: if (BB1V == BB2V) continue; - if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V)) + if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V, DL)) return Changed; - if (isa<ConstantExpr>(BB2V) && !isSafeToSpeculativelyExecute(BB2V)) + if (isa<ConstantExpr>(BB2V) && !isSafeToSpeculativelyExecute(BB2V, DL)) return Changed; } } @@ -1106,7 +1105,7 @@ HoistTerminator: // These values do not agree. Insert a select instruction before NT // that determines the right value. SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)]; - if (SI == 0) + if (!SI) SI = cast<SelectInst> (Builder.CreateSelect(BI->getCondition(), BB1V, BB2V, BB1V->getName()+"."+BB2V->getName())); @@ -1151,7 +1150,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { // Gather the PHI nodes in BBEnd. std::map<Value*, std::pair<Value*, PHINode*> > MapValueFromBB1ToBB2; - Instruction *FirstNonPhiInBBEnd = 0; + Instruction *FirstNonPhiInBBEnd = nullptr; for (BasicBlock::iterator I = BBEnd->begin(), E = BBEnd->end(); I != E; ++I) { if (PHINode *PN = dyn_cast<PHINode>(I)) { @@ -1229,7 +1228,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { // The operands should be either the same or they need to be generated // with a PHI node after sinking. We only handle the case where there is // a single pair of different operands. - Value *DifferentOp1 = 0, *DifferentOp2 = 0; + Value *DifferentOp1 = nullptr, *DifferentOp2 = nullptr; unsigned Op1Idx = 0; for (unsigned I = 0, E = I1->getNumOperands(); I != E; ++I) { if (I1->getOperand(I) == I2->getOperand(I)) @@ -1325,11 +1324,11 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB, BasicBlock *StoreBB, BasicBlock *EndBB) { StoreInst *StoreToHoist = dyn_cast<StoreInst>(I); if (!StoreToHoist) - return 0; + return nullptr; // Volatile or atomic. 
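isSafeToSpeculateStore, whose body continues below, scans backwards from the branch for an unconditional store to the same location. The source-level shape of the HoistCondStores transformation it enables (illustrative):

    void set(int *P, int V, bool C) {
      *P = 0;           // unconditional store found by the backward scan
      if (C)
        *P = V;         // conditional store to the same pointer
    }

    // The conditional store is speculated and its value becomes a select:
    void set_hoisted(int *P, int V, bool C) {
      *P = C ? V : 0;   // 0 is the previously stored value
    }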
if (!StoreToHoist->isSimple()) - return 0; + return nullptr; Value *StorePtr = StoreToHoist->getPointerOperand(); @@ -1341,7 +1340,7 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB, // Could be calling an instruction that effects memory like free(). if (CurI->mayHaveSideEffects() && !isa<StoreInst>(CurI)) - return 0; + return nullptr; StoreInst *SI = dyn_cast<StoreInst>(CurI); // Found the previous store make sure it stores to the same location. @@ -1349,10 +1348,10 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB, // Found the previous store, return its value operand. return SI->getValueOperand(); else if (SI) - return 0; // Unknown store. + return nullptr; // Unknown store. } - return 0; + return nullptr; } /// \brief Speculate a conditional basic block flattening the CFG. @@ -1392,7 +1391,8 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB, /// \endcode /// /// \returns true if the conditional block is removed. -static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) { +static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, + const DataLayout *DL) { // Be conservative for now. FP select instruction can often be expensive. Value *BrCond = BI->getCondition(); if (isa<FCmpInst>(BrCond)) @@ -1418,10 +1418,10 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) { SmallDenseMap<Instruction *, unsigned, 4> SinkCandidateUseCounts; unsigned SpeculationCost = 0; - Value *SpeculatedStoreValue = 0; - StoreInst *SpeculatedStore = 0; + Value *SpeculatedStoreValue = nullptr; + StoreInst *SpeculatedStore = nullptr; for (BasicBlock::iterator BBI = ThenBB->begin(), - BBE = llvm::prior(ThenBB->end()); + BBE = std::prev(ThenBB->end()); BBI != BBE; ++BBI) { Instruction *I = BBI; // Skip debug info. @@ -1435,13 +1435,13 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) { return false; // Don't hoist the instruction if it's unsafe or expensive. - if (!isSafeToSpeculativelyExecute(I) && + if (!isSafeToSpeculativelyExecute(I, DL) && !(HoistCondStores && (SpeculatedStoreValue = isSafeToSpeculateStore(I, BB, ThenBB, EndBB)))) return false; if (!SpeculatedStoreValue && - ComputeSpeculationCost(I) > PHINodeFoldingThreshold) + ComputeSpeculationCost(I, DL) > PHINodeFoldingThreshold) return false; // Store the store speculation candidate. @@ -1492,11 +1492,11 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) { if (!OrigCE && !ThenCE) continue; // Known safe and cheap. - if ((ThenCE && !isSafeToSpeculativelyExecute(ThenCE)) || - (OrigCE && !isSafeToSpeculativelyExecute(OrigCE))) + if ((ThenCE && !isSafeToSpeculativelyExecute(ThenCE, DL)) || + (OrigCE && !isSafeToSpeculativelyExecute(OrigCE, DL))) return false; - unsigned OrigCost = OrigCE ? ComputeSpeculationCost(OrigCE) : 0; - unsigned ThenCost = ThenCE ? ComputeSpeculationCost(ThenCE) : 0; + unsigned OrigCost = OrigCE ? ComputeSpeculationCost(OrigCE, DL) : 0; + unsigned ThenCost = ThenCE ? ComputeSpeculationCost(ThenCE, DL) : 0; if (OrigCost + ThenCost > 2 * PHINodeFoldingThreshold) return false; @@ -1531,7 +1531,7 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) { // Hoist the instructions. BB->getInstList().splice(BI, ThenBB->getInstList(), ThenBB->begin(), - llvm::prior(ThenBB->end())); + std::prev(ThenBB->end())); // Insert selects and rewrite the PHI operands. 
IRBuilder<true, NoFolder> Builder(BI); @@ -1589,10 +1589,9 @@ static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) { // We can only support instructions that do not define values that are // live outside of the current basic block. - for (Value::use_iterator UI = BBI->use_begin(), E = BBI->use_end(); - UI != E; ++UI) { - Instruction *U = cast<Instruction>(*UI); - if (U->getParent() != BB || isa<PHINode>(U)) return false; + for (User *U : BBI->users()) { + Instruction *UI = cast<Instruction>(U); + if (UI->getParent() != BB || isa<PHINode>(UI)) return false; } // Looks ok, continue checking. @@ -1605,7 +1604,7 @@ static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) { /// that is defined in the same block as the branch and if any PHI entries are /// constants, thread edges corresponding to that entry to be branches to their /// ultimate destination. -static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout *TD) { +static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout *DL) { BasicBlock *BB = BI->getParent(); PHINode *PN = dyn_cast<PHINode>(BI->getCondition()); // NOTE: we currently cannot transform this case if the PHI node is used @@ -1628,7 +1627,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout *TD) { // constants. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { ConstantInt *CB = dyn_cast<ConstantInt>(PN->getIncomingValue(i)); - if (CB == 0 || !CB->getType()->isIntegerTy(1)) continue; + if (!CB || !CB->getType()->isIntegerTy(1)) continue; // Okay, we now know that all edges from PredBB should be revectored to // branch to RealDest. @@ -1674,7 +1673,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout *TD) { } // Check for trivial simplification. - if (Value *V = SimplifyInstruction(N, TD)) { + if (Value *V = SimplifyInstruction(N, DL)) { TranslateMap[BBI] = V; delete N; // Instruction folded away, don't need actual inst } else { @@ -1695,7 +1694,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout *TD) { } // Recurse, simplifying any other constants. - return FoldCondBranchOnPHI(BI, TD) | true; + return FoldCondBranchOnPHI(BI, DL) | true; } return false; @@ -1703,7 +1702,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout *TD) { /// FoldTwoEntryPHINode - Given a BB that starts with the specified two-entry /// PHI node, see if we can eliminate it. -static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *TD) { +static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *DL) { // Ok, this is a two entry PHI node. Check to see if this is a simple "if // statement", which has a very simple dominance structure. Basically, we // are trying to find the condition that is being branched on, which @@ -1737,23 +1736,23 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *TD) { for (BasicBlock::iterator II = BB->begin(); isa<PHINode>(II);) { PHINode *PN = cast<PHINode>(II++); - if (Value *V = SimplifyInstruction(PN, TD)) { + if (Value *V = SimplifyInstruction(PN, DL)) { PN->replaceAllUsesWith(V); PN->eraseFromParent(); continue; } if (!DominatesMergePoint(PN->getIncomingValue(0), BB, &AggressiveInsts, - MaxCostVal0) || + MaxCostVal0, DL) || !DominatesMergePoint(PN->getIncomingValue(1), BB, &AggressiveInsts, - MaxCostVal1)) + MaxCostVal1, DL)) return false; } // If we folded the first phi, PN dangles at this point. Refresh it. If // we ran out of PHIs then we simplified them all. 
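The FoldCondBranchOnPHI changes above are mechanical (nullptr, the DL parameter to SimplifyInstruction), but the pass itself is easiest to see at the source level; a sketch with an invented work() helper:

    extern bool work();

    bool dispatch(bool A, bool B) {
      bool Cond;
      if (A) Cond = true; // this edge feeds the constant true into the PHI
      else   Cond = B;
      if (Cond)           // branch on a PHI of {true, B}
        return work();
      return false;
    }

    // FoldCondBranchOnPHI threads the constant edge: when A holds, control
    // jumps straight to the work() block without re-testing Cond.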
PN = dyn_cast<PHINode>(BB->begin()); - if (PN == 0) return true; + if (!PN) return true; // Don't fold i1 branches on PHIs which contain binary operators. These can // often be turned into switches and other things. @@ -1767,11 +1766,11 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *TD) { // instructions in the predecessor blocks can be promoted as well. If // not, we won't be able to get rid of the control flow, so it's not // worth promoting to select instructions. - BasicBlock *DomBlock = 0; + BasicBlock *DomBlock = nullptr; BasicBlock *IfBlock1 = PN->getIncomingBlock(0); BasicBlock *IfBlock2 = PN->getIncomingBlock(1); if (cast<BranchInst>(IfBlock1->getTerminator())->isConditional()) { - IfBlock1 = 0; + IfBlock1 = nullptr; } else { DomBlock = *pred_begin(IfBlock1); for (BasicBlock::iterator I = IfBlock1->begin();!isa<TerminatorInst>(I);++I) @@ -1784,7 +1783,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *TD) { } if (cast<BranchInst>(IfBlock2->getTerminator())->isConditional()) { - IfBlock2 = 0; + IfBlock2 = nullptr; } else { DomBlock = *pred_begin(IfBlock2); for (BasicBlock::iterator I = IfBlock2->begin();!isa<TerminatorInst>(I);++I) @@ -1964,10 +1963,10 @@ static bool checkCSEInPredecessor(Instruction *Inst, BasicBlock *PB) { /// FoldBranchToCommonDest - If this basic block is simple enough, and if a /// predecessor branches to us and one of our successors, fold the block into /// the predecessor and use logical operations to pick the right destination. -bool llvm::FoldBranchToCommonDest(BranchInst *BI) { +bool llvm::FoldBranchToCommonDest(BranchInst *BI, const DataLayout *DL) { BasicBlock *BB = BI->getParent(); - Instruction *Cond = 0; + Instruction *Cond = nullptr; if (BI->isConditional()) Cond = dyn_cast<Instruction>(BI->getCondition()); else { @@ -1993,12 +1992,12 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { } } - if (Cond == 0) + if (!Cond) return false; } - if (Cond == 0 || (!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) || - Cond->getParent() != BB || !Cond->hasOneUse()) + if (!Cond || (!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) || + Cond->getParent() != BB || !Cond->hasOneUse()) return false; // Only allow this if the condition is a simple instruction that can be @@ -2013,10 +2012,10 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // that feeds the branch. We later ensure that any values that _it_ uses // were also live in the predecessor, so that we don't unnecessarily create // register pressure or inhibit out-of-order execution. - Instruction *BonusInst = 0; + Instruction *BonusInst = nullptr; if (&*FrontIt != Cond && - FrontIt->hasOneUse() && *FrontIt->use_begin() == Cond && - isSafeToSpeculativelyExecute(FrontIt)) { + FrontIt->hasOneUse() && FrontIt->user_back() == Cond && + isSafeToSpeculativelyExecute(FrontIt, DL)) { BonusInst = &*FrontIt; ++FrontIt; @@ -2031,7 +2030,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // Make sure the instruction after the condition is the cond branch. BasicBlock::iterator CondIt = Cond; ++CondIt; - // Ingore dbg intrinsics. + // Ignore dbg intrinsics. while (isa<DbgInfoIntrinsic>(CondIt)) ++CondIt; if (&*CondIt != BI) @@ -2048,7 +2047,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // Finally, don't infinitely unroll conditional loops. BasicBlock *TrueDest = BI->getSuccessor(0); - BasicBlock *FalseDest = (BI->isConditional()) ? BI->getSuccessor(1) : 0; + BasicBlock *FalseDest = (BI->isConditional()) ? 
BI->getSuccessor(1) : nullptr; if (TrueDest == BB || FalseDest == BB) return false; @@ -2060,7 +2059,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // the common successor, verify that the same value flows in from both // blocks. SmallVector<PHINode*, 4> PHIs; - if (PBI == 0 || PBI->isUnconditional() || + if (!PBI || PBI->isUnconditional() || (BI->isConditional() && !SafeToMergeTerminators(BI, PBI)) || (!BI->isConditional() && @@ -2094,7 +2093,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // instructions that are used by the terminator's condition because it // exposes more merging opportunities. bool UsedByBranch = (BonusInst && BonusInst->hasOneUse() && - *BonusInst->use_begin() == Cond); + BonusInst->user_back() == Cond); if (BonusInst && !UsedByBranch) { // Collect the values used by the bonus inst @@ -2150,9 +2149,17 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { } // If we have a bonus inst, clone it into the predecessor block. - Instruction *NewBonus = 0; + Instruction *NewBonus = nullptr; if (BonusInst) { NewBonus = BonusInst->clone(); + + // If we moved a load, we cannot any longer claim any knowledge about + // its potential value. The previous information might have been valid + // only given the branch precondition. + // For an analogous reason, we must also drop all the metadata whose + // semantics we don't understand. + NewBonus->dropUnknownMetadata(LLVMContext::MD_dbg); + PredBlock->getInstList().insert(PBI, NewBonus); NewBonus->takeName(BonusInst); BonusInst->setName(BonusInst->getName()+".old"); @@ -2218,14 +2225,14 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { MDBuilder(BI->getContext()). createBranchWeights(MDWeights)); } else - PBI->setMetadata(LLVMContext::MD_prof, NULL); + PBI->setMetadata(LLVMContext::MD_prof, nullptr); } else { // Update PHI nodes in the common successors. for (unsigned i = 0, e = PHIs.size(); i != e; ++i) { ConstantInt *PBI_C = cast<ConstantInt>( PHIs[i]->getIncomingValueForBlock(PBI->getParent())); assert(PBI_C->getType()->isIntegerTy(1)); - Instruction *MergedCond = 0; + Instruction *MergedCond = nullptr; if (PBI->getSuccessor(0) == TrueDest) { // Create (PBI_Cond and PBI_C) or (!PBI_Cond and BI_Value) // PBI_C is true: PBI_Cond or (!PBI_Cond and BI_Value) @@ -2338,7 +2345,7 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { } // If this is a conditional branch in an empty block, and if any - // predecessors is a conditional branch to one of our destinations, + // predecessors are a conditional branch to one of our destinations, // fold the conditions into logical ops and one cond br. BasicBlock::iterator BBI = BB->begin(); // Ignore dbg intrinsics. @@ -2373,16 +2380,33 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { // Do not perform this transformation if it would require // insertion of a large number of select instructions. For targets // without predication/cmovs, this is a big pessimization. - BasicBlock *CommonDest = PBI->getSuccessor(PBIOp); + // Also do not perform this transformation if any phi node in the common + // destination block can trap when reached by BB or PBB (PR17073). In that + // case, it would be unsafe to hoist the operation into a select instruction. + + BasicBlock *CommonDest = PBI->getSuccessor(PBIOp); unsigned NumPhis = 0; for (BasicBlock::iterator II = CommonDest->begin(); - isa<PHINode>(II); ++II, ++NumPhis) + isa<PHINode>(II); ++II, ++NumPhis) { if (NumPhis > 2) // Disable this xform. 
return false; + PHINode *PN = cast<PHINode>(II); + Value *BIV = PN->getIncomingValueForBlock(BB); + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BIV)) + if (CE->canTrap()) + return false; + + unsigned PBBIdx = PN->getBasicBlockIndex(PBI->getParent()); + Value *PBIV = PN->getIncomingValue(PBBIdx); + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(PBIV)) + if (CE->canTrap()) + return false; + } + // Finally, if everything is ok, fold the branches to logical ops. - BasicBlock *OtherDest = BI->getSuccessor(BIOp ^ 1); + BasicBlock *OtherDest = BI->getSuccessor(BIOp ^ 1); DEBUG(dbgs() << "FOLDING BRs:" << *PBI->getParent() << "AND: " << *BI->getParent()); @@ -2498,16 +2522,16 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond, // If TrueBB and FalseBB are equal, only try to preserve one copy of that // successor. BasicBlock *KeepEdge1 = TrueBB; - BasicBlock *KeepEdge2 = TrueBB != FalseBB ? FalseBB : 0; + BasicBlock *KeepEdge2 = TrueBB != FalseBB ? FalseBB : nullptr; // Then remove the rest. for (unsigned I = 0, E = OldTerm->getNumSuccessors(); I != E; ++I) { BasicBlock *Succ = OldTerm->getSuccessor(I); // Make sure only to keep exactly one copy of each edge. if (Succ == KeepEdge1) - KeepEdge1 = 0; + KeepEdge1 = nullptr; else if (Succ == KeepEdge2) - KeepEdge2 = 0; + KeepEdge2 = nullptr; else Succ->removePredecessor(OldTerm->getParent()); } @@ -2516,7 +2540,7 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond, Builder.SetCurrentDebugLocation(OldTerm->getDebugLoc()); // Insert an appropriate new terminator. - if ((KeepEdge1 == 0) && (KeepEdge2 == 0)) { + if (!KeepEdge1 && !KeepEdge2) { if (TrueBB == FalseBB) // We were only looking for one successor, and it was present. // Create an unconditional branch to it. @@ -2538,7 +2562,7 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond, // One of the selected values was a successor, but the other wasn't. // Insert an unconditional branch to the one that was found; // the edge to the one that wasn't must be unreachable. - if (KeepEdge1 == 0) + if (!KeepEdge1) // Only TrueBB was found. Builder.CreateBr(TrueBB); else @@ -2625,7 +2649,7 @@ static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) { /// the PHI, merging the third icmp into the switch. static bool TryToSimplifyUncondBranchWithICmpInIt( ICmpInst *ICI, IRBuilder<> &Builder, const TargetTransformInfo &TTI, - const DataLayout *TD) { + const DataLayout *DL) { BasicBlock *BB = ICI->getParent(); // If the block has any PHIs in it or the icmp has multiple uses, it is too @@ -2639,7 +2663,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt( // 'V' and this block is the default case for the switch. In this case we can // fold the compared value into the switch to simplify things. BasicBlock *Pred = BB->getSinglePredecessor(); - if (Pred == 0 || !isa<SwitchInst>(Pred->getTerminator())) return false; + if (!Pred || !isa<SwitchInst>(Pred->getTerminator())) return false; SwitchInst *SI = cast<SwitchInst>(Pred->getTerminator()); if (SI->getCondition() != V) @@ -2653,12 +2677,12 @@ static bool TryToSimplifyUncondBranchWithICmpInIt( assert(VVal && "Should have a unique destination value"); ICI->setOperand(0, VVal); - if (Value *V = SimplifyInstruction(ICI, TD)) { + if (Value *V = SimplifyInstruction(ICI, DL)) { ICI->replaceAllUsesWith(V); ICI->eraseFromParent(); } // BB is now empty, so it is likely to simplify away. 
- return SimplifyCFG(BB, TTI, TD) | true; + return SimplifyCFG(BB, TTI, DL) | true; } // Ok, the block is reachable from the default dest. If the constant we're @@ -2674,14 +2698,14 @@ static bool TryToSimplifyUncondBranchWithICmpInIt( ICI->replaceAllUsesWith(V); ICI->eraseFromParent(); // BB is now empty, so it is likely to simplify away. - return SimplifyCFG(BB, TTI, TD) | true; + return SimplifyCFG(BB, TTI, DL) | true; } // The use of the icmp has to be in the 'end' block, by the only PHI node in // the block. BasicBlock *SuccBlock = BB->getTerminator()->getSuccessor(0); - PHINode *PHIUse = dyn_cast<PHINode>(ICI->use_back()); - if (PHIUse == 0 || PHIUse != &SuccBlock->front() || + PHINode *PHIUse = dyn_cast<PHINode>(ICI->user_back()); + if (PHIUse == nullptr || PHIUse != &SuccBlock->front() || isa<PHINode>(++BasicBlock::iterator(PHIUse))) return false; @@ -2730,32 +2754,32 @@ static bool TryToSimplifyUncondBranchWithICmpInIt( /// SimplifyBranchOnICmpChain - The specified branch is a conditional branch. /// Check to see if it is branching on an or/and chain of icmp instructions, and /// fold it into a switch instruction if so. -static bool SimplifyBranchOnICmpChain(BranchInst *BI, const DataLayout *TD, +static bool SimplifyBranchOnICmpChain(BranchInst *BI, const DataLayout *DL, IRBuilder<> &Builder) { Instruction *Cond = dyn_cast<Instruction>(BI->getCondition()); - if (Cond == 0) return false; + if (!Cond) return false; // Change br (X == 0 | X == 1), T, F into a switch instruction. // If this is a bunch of seteq's or'd together, or if it's a bunch of // 'setne's and'ed together, collect them. - Value *CompVal = 0; + Value *CompVal = nullptr; std::vector<ConstantInt*> Values; bool TrueWhenEqual = true; - Value *ExtraCase = 0; + Value *ExtraCase = nullptr; unsigned UsedICmps = 0; if (Cond->getOpcode() == Instruction::Or) { - CompVal = GatherConstantCompares(Cond, Values, ExtraCase, TD, true, + CompVal = GatherConstantCompares(Cond, Values, ExtraCase, DL, true, UsedICmps); } else if (Cond->getOpcode() == Instruction::And) { - CompVal = GatherConstantCompares(Cond, Values, ExtraCase, TD, false, + CompVal = GatherConstantCompares(Cond, Values, ExtraCase, DL, false, UsedICmps); TrueWhenEqual = false; } // If we didn't have a multiply compared value, fail. - if (CompVal == 0) return false; + if (!CompVal) return false; // Avoid turning single icmps into a switch. if (UsedICmps <= 1) @@ -2811,9 +2835,9 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const DataLayout *TD, Builder.SetInsertPoint(BI); // Convert pointer to int before we switch. if (CompVal->getType()->isPointerTy()) { - assert(TD && "Cannot switch on pointer without DataLayout"); + assert(DL && "Cannot switch on pointer without DataLayout"); CompVal = Builder.CreatePtrToInt(CompVal, - TD->getIntPtrType(CompVal->getType()), + DL->getIntPtrType(CompVal->getType()), "magicptr"); } @@ -3050,7 +3074,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { // Find the most popular block. 
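// A minimal standalone C++ model (not LLVM API; all names illustrative) of
// the SimplifyBranchOnICmpChain fold above: a branch condition that is a
// chain of equality compares or'd together, e.g. (X == 0 | X == 1 | X == 42),
// is rewritten as a single switch over the collected constants.
#include <cstdio>
#include <vector>

// The pattern GatherConstantCompares matches: a disjunction of (X == Ci).
static bool orOfEqualities(long X, const std::vector<long> &Cs) {
  bool R = false;
  for (long C : Cs)
    R |= (X == C);              // "a bunch of seteq's or'd together"
  return R;
}

// After the fold: one switch on X with a case per collected constant.
static bool asSwitch(long X, const std::vector<long> &Cs) {
  for (long C : Cs)             // models the switch's case list
    if (X == C)
      return true;              // TrueWhenEqual: take the true destination
  return false;                 // default: the false destination
}

int main() {
  const std::vector<long> Cs = {0, 1, 42};
  for (long X = -2; X <= 43; ++X)
    if (orOfEqualities(X, Cs) != asSwitch(X, Cs))
      std::printf("mismatch at %ld\n", X);
  std::printf("icmp chain and switch agree on all tested inputs\n");
  return 0;
}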
unsigned MaxPop = 0; unsigned MaxIndex = 0; - BasicBlock *MaxBlock = 0; + BasicBlock *MaxBlock = nullptr; for (std::map<BasicBlock*, std::pair<unsigned, unsigned> >::iterator I = Popularity.begin(), E = Popularity.end(); I != E; ++I) { if (I->second.first > MaxPop || @@ -3188,7 +3212,7 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI) { Value *Cond = SI->getCondition(); unsigned Bits = Cond->getType()->getIntegerBitWidth(); APInt KnownZero(Bits, 0), KnownOne(Bits, 0); - ComputeMaskedBits(Cond, KnownZero, KnownOne); + computeKnownBits(Cond, KnownZero, KnownOne); // Gather dead cases. SmallVector<ConstantInt*, 8> DeadCases; @@ -3222,7 +3246,7 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI) { Case.getCaseSuccessor()->removePredecessor(SI->getParent()); SI->removeCase(Case); } - if (HasWeight) { + if (HasWeight && Weights.size() >= 2) { SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end()); SI->setMetadata(LLVMContext::MD_prof, MDBuilder(SI->getParent()->getContext()). @@ -3241,13 +3265,13 @@ static PHINode *FindPHIForConditionForwarding(ConstantInt *CaseValue, BasicBlock *BB, int *PhiIndex) { if (BB->getFirstNonPHIOrDbg() != BB->getTerminator()) - return NULL; // BB must be empty to be a candidate for simplification. + return nullptr; // BB must be empty to be a candidate for simplification. if (!BB->getSinglePredecessor()) - return NULL; // BB must be dominated by the switch. + return nullptr; // BB must be dominated by the switch. BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); if (!Branch || !Branch->isUnconditional()) - return NULL; // Terminator must be unconditional branch. + return nullptr; // Terminator must be unconditional branch. BasicBlock *Succ = Branch->getSuccessor(0); @@ -3263,7 +3287,7 @@ static PHINode *FindPHIForConditionForwarding(ConstantInt *CaseValue, return PHI; } - return NULL; + return nullptr; } /// ForwardSwitchConditionToPHI - Try to forward the condition of a switch @@ -3306,6 +3330,11 @@ static bool ForwardSwitchConditionToPHI(SwitchInst *SI) { /// ValidLookupTableConstant - Return true if the backend will be able to handle /// initializing an array of constants like C. 
static bool ValidLookupTableConstant(Constant *C) { + if (C->isThreadDependent()) + return false; + if (C->isDLLImportDependent()) + return false; + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) return CE->isGEPWithNoNotionalOverIndexing(); @@ -3336,12 +3365,12 @@ ConstantFold(Instruction *I, if (SelectInst *Select = dyn_cast<SelectInst>(I)) { Constant *A = LookupConstant(Select->getCondition(), ConstantPool); if (!A) - return 0; + return nullptr; if (A->isAllOnesValue()) return LookupConstant(Select->getTrueValue(), ConstantPool); if (A->isNullValue()) return LookupConstant(Select->getFalseValue(), ConstantPool); - return 0; + return nullptr; } SmallVector<Constant *, 4> COps; @@ -3349,7 +3378,7 @@ ConstantFold(Instruction *I, if (Constant *A = LookupConstant(I->getOperand(N), ConstantPool)) COps.push_back(A); else - return 0; + return nullptr; } if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) @@ -3428,7 +3457,7 @@ GetCaseResults(SwitchInst *SI, Res.push_back(std::make_pair(PHI, ConstVal)); } - return true; + return Res.size() > 0; } namespace { @@ -3444,7 +3473,7 @@ namespace { ConstantInt *Offset, const SmallVectorImpl<std::pair<ConstantInt*, Constant*> >& Values, Constant *DefaultValue, - const DataLayout *TD); + const DataLayout *DL); /// BuildLookup - Build instructions with Builder to retrieve the value at /// the position given by Index in the lookup table. @@ -3452,7 +3481,7 @@ namespace { /// WouldFitInRegister - Return true if a table with TableSize elements of /// type ElementType would fit in a target-legal register. - static bool WouldFitInRegister(const DataLayout *TD, + static bool WouldFitInRegister(const DataLayout *DL, uint64_t TableSize, const Type *ElementType); @@ -3491,38 +3520,44 @@ SwitchLookupTable::SwitchLookupTable(Module &M, ConstantInt *Offset, const SmallVectorImpl<std::pair<ConstantInt*, Constant*> >& Values, Constant *DefaultValue, - const DataLayout *TD) - : SingleValue(0), BitMap(0), BitMapElementTy(0), Array(0) { + const DataLayout *DL) + : SingleValue(nullptr), BitMap(nullptr), BitMapElementTy(nullptr), + Array(nullptr) { assert(Values.size() && "Can't build lookup table without values!"); assert(TableSize >= Values.size() && "Can't fit values in table!"); // If all values in the table are equal, this is that value. SingleValue = Values.begin()->second; + Type *ValueType = Values.begin()->second->getType(); + // Build up the table contents. SmallVector<Constant*, 64> TableContents(TableSize); for (size_t I = 0, E = Values.size(); I != E; ++I) { ConstantInt *CaseVal = Values[I].first; Constant *CaseRes = Values[I].second; - assert(CaseRes->getType() == DefaultValue->getType()); + assert(CaseRes->getType() == ValueType); uint64_t Idx = (CaseVal->getValue() - Offset->getValue()) .getLimitedValue(); TableContents[Idx] = CaseRes; if (CaseRes != SingleValue) - SingleValue = 0; + SingleValue = nullptr; } // Fill in any holes in the table with the default result. if (Values.size() < TableSize) { + assert(DefaultValue && + "Need a default value to fill the lookup table holes."); + assert(DefaultValue->getType() == ValueType); for (uint64_t I = 0; I < TableSize; ++I) { if (!TableContents[I]) TableContents[I] = DefaultValue; } if (DefaultValue != SingleValue) - SingleValue = 0; + SingleValue = nullptr; } // If each element in the table contains the same value, we only need to store @@ -3533,8 +3568,8 @@ SwitchLookupTable::SwitchLookupTable(Module &M, } // If the type is integer and the table fits in a register, build a bitmap. 
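// A standalone sketch (illustrative names, not the LLVM classes) of the
// SwitchLookupTable construction rules enforced above: holes left by missing
// cases are filled with the default result, which must then exist and match
// the common value type, and a table whose slots all end up identical
// collapses to the single-value representation.
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

struct ModelTable {
  bool HasSingleValue = true;
  int SingleValue = 0;
  std::vector<int> Contents;

  // Values maps table index -> result; DefaultValue may be null when the
  // cases already cover every slot. INT32_MIN marks a hole (sketch only).
  ModelTable(uint64_t TableSize,
             const std::vector<std::pair<uint64_t, int>> &Values,
             const int *DefaultValue) {
    assert(!Values.empty() && "can't build lookup table without values");
    assert(TableSize >= Values.size() && "can't fit values in table");
    Contents.assign(TableSize, INT32_MIN);
    SingleValue = Values.front().second;
    for (const auto &V : Values) {
      Contents[V.first] = V.second;
      if (V.second != SingleValue)
        HasSingleValue = false;
    }
    if (Values.size() < TableSize) {   // the table has holes
      assert(DefaultValue && "need a default value to fill the holes");
      for (int &Slot : Contents)
        if (Slot == INT32_MIN)
          Slot = *DefaultValue;
      if (*DefaultValue != SingleValue)
        HasSingleValue = false;
    }
  }
};

int main() {
  const int Def = 7;
  ModelTable T(6, {{0, 7}, {2, 7}, {5, 7}}, &Def); // holes at 1, 3, 4
  std::printf("single value? %s\n", T.HasSingleValue ? "yes" : "no"); // yes
  return 0;
}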
- if (WouldFitInRegister(TD, TableSize, DefaultValue->getType())) { - IntegerType *IT = cast<IntegerType>(DefaultValue->getType()); + if (WouldFitInRegister(DL, TableSize, ValueType)) { + IntegerType *IT = cast<IntegerType>(ValueType); APInt TableInt(TableSize * IT->getBitWidth(), 0); for (uint64_t I = TableSize; I > 0; --I) { TableInt <<= IT->getBitWidth(); @@ -3552,7 +3587,7 @@ SwitchLookupTable::SwitchLookupTable(Module &M, } // Store the table in an array. - ArrayType *ArrayTy = ArrayType::get(DefaultValue->getType(), TableSize); + ArrayType *ArrayTy = ArrayType::get(ValueType, TableSize); Constant *Initializer = ConstantArray::get(ArrayTy, TableContents); Array = new GlobalVariable(M, ArrayTy, /*constant=*/ true, @@ -3589,6 +3624,16 @@ Value *SwitchLookupTable::BuildLookup(Value *Index, IRBuilder<> &Builder) { "switch.masked"); } case ArrayKind: { + // Make sure the table index will not overflow when treated as signed. + IntegerType *IT = cast<IntegerType>(Index->getType()); + uint64_t TableSize = Array->getInitializer()->getType() + ->getArrayNumElements(); + if (TableSize > (1ULL << (IT->getBitWidth() - 1))) + Index = Builder.CreateZExt(Index, + IntegerType::get(IT->getContext(), + IT->getBitWidth() + 1), + "switch.tableidx.zext"); + Value *GEPIndices[] = { Builder.getInt32(0), Index }; Value *GEP = Builder.CreateInBoundsGEP(Array, GEPIndices, "switch.gep"); @@ -3598,10 +3643,10 @@ Value *SwitchLookupTable::BuildLookup(Value *Index, IRBuilder<> &Builder) { llvm_unreachable("Unknown lookup table kind!"); } -bool SwitchLookupTable::WouldFitInRegister(const DataLayout *TD, +bool SwitchLookupTable::WouldFitInRegister(const DataLayout *DL, uint64_t TableSize, const Type *ElementType) { - if (!TD) + if (!DL) return false; const IntegerType *IT = dyn_cast<IntegerType>(ElementType); if (!IT) @@ -3612,7 +3657,7 @@ bool SwitchLookupTable::WouldFitInRegister(const DataLayout *TD, // Avoid overflow, fitsInLegalInteger uses unsigned int for the width. if (TableSize >= UINT_MAX/IT->getBitWidth()) return false; - return TD->fitsInLegalInteger(TableSize * IT->getBitWidth()); + return DL->fitsInLegalInteger(TableSize * IT->getBitWidth()); } /// ShouldBuildLookupTable - Determine whether a lookup table should be built @@ -3621,7 +3666,7 @@ bool SwitchLookupTable::WouldFitInRegister(const DataLayout *TD, static bool ShouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize, const TargetTransformInfo &TTI, - const DataLayout *TD, + const DataLayout *DL, const SmallDenseMap<PHINode*, Type*>& ResultTypes) { if (SI->getNumCases() > TableSize || TableSize >= UINT64_MAX / 10) return false; // TableSize overflowed, or mul below might overflow. @@ -3637,7 +3682,7 @@ static bool ShouldBuildLookupTable(SwitchInst *SI, // Saturate this flag to false. AllTablesFitInRegister = AllTablesFitInRegister && - SwitchLookupTable::WouldFitInRegister(TD, TableSize, Ty); + SwitchLookupTable::WouldFitInRegister(DL, TableSize, Ty); // If both flags saturate, we're done. NOTE: This *only* works with // saturating flags, and all flags have to saturate first due to the @@ -3666,7 +3711,7 @@ static bool ShouldBuildLookupTable(SwitchInst *SI, static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, const TargetTransformInfo &TTI, - const DataLayout* TD) { + const DataLayout* DL) { assert(SI->getNumCases() > 1 && "Degenerate switch?"); // Only build lookup table when we have a target that supports it. @@ -3680,11 +3725,9 @@ static bool SwitchToLookupTable(SwitchInst *SI, // GEP needs a runtime relocation in PIC code. 
We should just build one big // string and lookup indices into that. - // Ignore the switch if the number of cases is too small. - // This is similar to the check when building jump tables in - // SelectionDAGBuilder::handleJTSwitchCase. - // FIXME: Determine the best cut-off. - if (SI->getNumCases() < 4) + // Ignore switches with less than three cases. Lookup tables will not make them + // faster, so we don't analyze them. + if (SI->getNumCases() < 3) return false; // Figure out the corresponding result for each case value and phi node in the @@ -3694,7 +3737,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, ConstantInt *MinCaseVal = CI.getCaseValue(); ConstantInt *MaxCaseVal = CI.getCaseValue(); - BasicBlock *CommonDest = 0; + BasicBlock *CommonDest = nullptr; typedef SmallVector<std::pair<ConstantInt*, Constant*>, 4> ResultListTy; SmallDenseMap<PHINode*, ResultListTy> ResultLists; SmallDenseMap<PHINode*, Constant*> DefaultResults; @@ -3712,7 +3755,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, typedef SmallVector<std::pair<PHINode*, Constant*>, 4> ResultsTy; ResultsTy Results; if (!GetCaseResults(SI, CaseVal, CI.getCaseSuccessor(), &CommonDest, - Results, TD)) + Results, DL)) return false; // Append the result from this case to the list for each phi. @@ -3723,21 +3766,41 @@ static bool SwitchToLookupTable(SwitchInst *SI, } } - // Get the resulting values for the default case. + // Keep track of the result types. + for (size_t I = 0, E = PHIs.size(); I != E; ++I) { + PHINode *PHI = PHIs[I]; + ResultTypes[PHI] = ResultLists[PHI][0].second->getType(); + } + + uint64_t NumResults = ResultLists[PHIs[0]].size(); + APInt RangeSpread = MaxCaseVal->getValue() - MinCaseVal->getValue(); + uint64_t TableSize = RangeSpread.getLimitedValue() + 1; + bool TableHasHoles = (NumResults < TableSize); + + // If the table has holes, we need a constant result for the default case + // or a bitmask that fits in a register. SmallVector<std::pair<PHINode*, Constant*>, 4> DefaultResultsList; - if (!GetCaseResults(SI, 0, SI->getDefaultDest(), &CommonDest, - DefaultResultsList, TD)) - return false; + bool HasDefaultResults = false; + if (TableHasHoles) { + HasDefaultResults = GetCaseResults(SI, nullptr, SI->getDefaultDest(), + &CommonDest, DefaultResultsList, DL); + } + bool NeedMask = (TableHasHoles && !HasDefaultResults); + if (NeedMask) { + // As an extra penalty for the validity test we require more cases. + if (SI->getNumCases() < 4) // FIXME: Find best threshold value (benchmark). + return false; + if (!(DL && DL->fitsInLegalInteger(TableSize))) + return false; + } + for (size_t I = 0, E = DefaultResultsList.size(); I != E; ++I) { PHINode *PHI = DefaultResultsList[I].first; Constant *Result = DefaultResultsList[I].second; DefaultResults[PHI] = Result; - ResultTypes[PHI] = Result->getType(); } - APInt RangeSpread = MaxCaseVal->getValue() - MinCaseVal->getValue(); - uint64_t TableSize = RangeSpread.getLimitedValue() + 1; - if (!ShouldBuildLookupTable(SI, TableSize, TTI, TD, ResultTypes)) + if (!ShouldBuildLookupTable(SI, TableSize, TTI, DL, ResultTypes)) return false; // Create the BB that does the lookups. @@ -3755,7 +3818,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, // Compute the maximum table size representable by the integer type we are // switching upon. unsigned CaseSize = MinCaseVal->getType()->getPrimitiveSizeInBits(); - uint64_t MaxTableSize = CaseSize > 63? UINT64_MAX : 1ULL << CaseSize; + uint64_t MaxTableSize = CaseSize > 63 ? 
UINT64_MAX : 1ULL << CaseSize; assert(MaxTableSize >= TableSize && "It is impossible for a switch to have more entries than the max " "representable value of its input integer type's size."); @@ -3770,25 +3833,67 @@ static bool SwitchToLookupTable(SwitchInst *SI, SI->getDefaultDest()->removePredecessor(SI->getParent()); } else { Value *Cmp = Builder.CreateICmpULT(TableIndex, ConstantInt::get( - MinCaseVal->getType(), TableSize)); + MinCaseVal->getType(), TableSize)); Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest()); } // Populate the BB that does the lookups. Builder.SetInsertPoint(LookupBB); + + if (NeedMask) { + // Before doing the lookup we do the hole check. + // The LookupBB is therefore re-purposed to do the hole check + // and we create a new LookupBB. + BasicBlock *MaskBB = LookupBB; + MaskBB->setName("switch.hole_check"); + LookupBB = BasicBlock::Create(Mod.getContext(), + "switch.lookup", + CommonDest->getParent(), + CommonDest); + + // Build bitmask; fill in a 1 bit for every case. + APInt MaskInt(TableSize, 0); + APInt One(TableSize, 1); + const ResultListTy &ResultList = ResultLists[PHIs[0]]; + for (size_t I = 0, E = ResultList.size(); I != E; ++I) { + uint64_t Idx = (ResultList[I].first->getValue() - + MinCaseVal->getValue()).getLimitedValue(); + MaskInt |= One << Idx; + } + ConstantInt *TableMask = ConstantInt::get(Mod.getContext(), MaskInt); + + // Get the TableIndex'th bit of the bitmask. + // If this bit is 0 (meaning hole) jump to the default destination, + // else continue with table lookup. + IntegerType *MapTy = TableMask->getType(); + Value *MaskIndex = Builder.CreateZExtOrTrunc(TableIndex, MapTy, + "switch.maskindex"); + Value *Shifted = Builder.CreateLShr(TableMask, MaskIndex, + "switch.shifted"); + Value *LoBit = Builder.CreateTrunc(Shifted, + Type::getInt1Ty(Mod.getContext()), + "switch.lobit"); + Builder.CreateCondBr(LoBit, LookupBB, SI->getDefaultDest()); + + Builder.SetInsertPoint(LookupBB); + AddPredecessorToBlock(SI->getDefaultDest(), MaskBB, SI->getParent()); + } + bool ReturnedEarly = false; for (size_t I = 0, E = PHIs.size(); I != E; ++I) { PHINode *PHI = PHIs[I]; + // If using a bitmask, use any value to fill the lookup table holes. + Constant *DV = NeedMask ? ResultLists[PHI][0].second : DefaultResults[PHI]; SwitchLookupTable Table(Mod, TableSize, MinCaseVal, ResultLists[PHI], - DefaultResults[PHI], TD); + DV, DL); Value *Result = Table.BuildLookup(TableIndex, Builder); // If the result is used to return immediately from the function, we want to // do that right here. - if (PHI->hasOneUse() && isa<ReturnInst>(*PHI->use_begin()) && - *PHI->use_begin() == CommonDest->getFirstNonPHIOrDbg()) { + if (PHI->hasOneUse() && isa<ReturnInst>(*PHI->user_begin()) && + PHI->user_back() == CommonDest->getFirstNonPHIOrDbg()) { Builder.CreateRet(Result); ReturnedEarly = true; break; @@ -3811,6 +3916,8 @@ static bool SwitchToLookupTable(SwitchInst *SI, SI->eraseFromParent(); ++NumLookupTables; + if (NeedMask) + ++NumLookupTablesHoles; return true; } @@ -3822,12 +3929,12 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { // see if that predecessor totally determines the outcome of this switch. 
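// A standalone model of the hole-check bitmask built above (illustrative
// names; assumes TableSize <= 64 so a uint64_t can stand in for the APInt,
// which the real code sizes to exactly TableSize bits): one bit is set per
// reachable case, and the TableIndex'th bit decides lookup vs. default.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const uint64_t TableSize = 10;
  const std::vector<uint64_t> CaseIdxs = {0, 2, 3, 7}; // CaseVal - MinCaseVal
  uint64_t TableMask = 0;
  for (uint64_t Idx : CaseIdxs)
    TableMask |= (uint64_t)1 << Idx;                   // a 1 bit per case

  for (uint64_t TableIndex = 0; TableIndex != TableSize; ++TableIndex) {
    bool LoBit = (TableMask >> TableIndex) & 1;        // the "switch.lobit"
    std::printf("index %llu -> %s\n", (unsigned long long)TableIndex,
                LoBit ? "table lookup" : "default destination (hole)");
  }
  return 0;
}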
if (BasicBlock *OnlyPred = BB->getSinglePredecessor()) if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred, Builder)) - return SimplifyCFG(BB, TTI, TD) | true; + return SimplifyCFG(BB, TTI, DL) | true; Value *Cond = SI->getCondition(); if (SelectInst *Select = dyn_cast<SelectInst>(Cond)) if (SimplifySwitchOnSelect(SI, Select)) - return SimplifyCFG(BB, TTI, TD) | true; + return SimplifyCFG(BB, TTI, DL) | true; // If the block only contains the switch, see if we can fold the block // away into any preds. @@ -3837,22 +3944,22 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { ++BBI; if (SI == &*BBI) if (FoldValueComparisonIntoPredecessors(SI, Builder)) - return SimplifyCFG(BB, TTI, TD) | true; + return SimplifyCFG(BB, TTI, DL) | true; } // Try to transform the switch into an icmp and a branch. if (TurnSwitchRangeIntoICmp(SI, Builder)) - return SimplifyCFG(BB, TTI, TD) | true; + return SimplifyCFG(BB, TTI, DL) | true; // Remove unreachable cases. if (EliminateDeadSwitchCases(SI)) - return SimplifyCFG(BB, TTI, TD) | true; + return SimplifyCFG(BB, TTI, DL) | true; if (ForwardSwitchConditionToPHI(SI)) - return SimplifyCFG(BB, TTI, TD) | true; + return SimplifyCFG(BB, TTI, DL) | true; - if (SwitchToLookupTable(SI, Builder, TTI, TD)) - return SimplifyCFG(BB, TTI, TD) | true; + if (SwitchToLookupTable(SI, Builder, TTI, DL)) + return SimplifyCFG(BB, TTI, DL) | true; return false; } @@ -3889,7 +3996,7 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) { if (SelectInst *SI = dyn_cast<SelectInst>(IBI->getAddress())) { if (SimplifyIndirectBrOnSelect(IBI, SI)) - return SimplifyCFG(BB, TTI, TD) | true; + return SimplifyCFG(BB, TTI, DL) | true; } return Changed; } @@ -3913,7 +4020,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){ for (++I; isa<DbgInfoIntrinsic>(I); ++I) ; if (I->isTerminator() && - TryToSimplifyUncondBranchWithICmpInIt(ICI, Builder, TTI, TD)) + TryToSimplifyUncondBranchWithICmpInIt(ICI, Builder, TTI, DL)) return true; } @@ -3921,8 +4028,8 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){ // branches to us and our successor, fold the comparison into the // predecessor and use logical operations to update the incoming value // for PHI nodes in common successor. - if (FoldBranchToCommonDest(BI)) - return SimplifyCFG(BB, TTI, TD) | true; + if (FoldBranchToCommonDest(BI, DL)) + return SimplifyCFG(BB, TTI, DL) | true; return false; } @@ -3937,7 +4044,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { // switch. if (BasicBlock *OnlyPred = BB->getSinglePredecessor()) if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder)) - return SimplifyCFG(BB, TTI, TD) | true; + return SimplifyCFG(BB, TTI, DL) | true; // This block must be empty, except for the setcond inst, if it exists. // Ignore dbg intrinsics. @@ -3947,67 +4054,67 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { ++I; if (&*I == BI) { if (FoldValueComparisonIntoPredecessors(BI, Builder)) - return SimplifyCFG(BB, TTI, TD) | true; + return SimplifyCFG(BB, TTI, DL) | true; } else if (&*I == cast<Instruction>(BI->getCondition())){ ++I; // Ignore dbg intrinsics. while (isa<DbgInfoIntrinsic>(I)) ++I; if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder)) - return SimplifyCFG(BB, TTI, TD) | true; + return SimplifyCFG(BB, TTI, DL) | true; } } // Try to turn "br (X == 0 | X == 1), T, F" into a switch instruction. 
- if (SimplifyBranchOnICmpChain(BI, TD, Builder)) + if (SimplifyBranchOnICmpChain(BI, DL, Builder)) return true; // If this basic block is ONLY a compare and a branch, and if a predecessor // branches to us and one of our successors, fold the comparison into the // predecessor and use logical operations to pick the right destination. - if (FoldBranchToCommonDest(BI)) - return SimplifyCFG(BB, TTI, TD) | true; + if (FoldBranchToCommonDest(BI, DL)) + return SimplifyCFG(BB, TTI, DL) | true; // We have a conditional branch to two blocks that are only reachable // from BI. We know that the condbr dominates the two blocks, so see if // there is any identical code in the "then" and "else" blocks. If so, we // can hoist it up to the branching block. - if (BI->getSuccessor(0)->getSinglePredecessor() != 0) { - if (BI->getSuccessor(1)->getSinglePredecessor() != 0) { - if (HoistThenElseCodeToIf(BI)) - return SimplifyCFG(BB, TTI, TD) | true; + if (BI->getSuccessor(0)->getSinglePredecessor()) { + if (BI->getSuccessor(1)->getSinglePredecessor()) { + if (HoistThenElseCodeToIf(BI, DL)) + return SimplifyCFG(BB, TTI, DL) | true; } else { // If Successor #1 has multiple preds, we may be able to conditionally - // execute Successor #0 if it branches to successor #1. + // execute Successor #0 if it branches to Successor #1. TerminatorInst *Succ0TI = BI->getSuccessor(0)->getTerminator(); if (Succ0TI->getNumSuccessors() == 1 && Succ0TI->getSuccessor(0) == BI->getSuccessor(1)) - if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0))) - return SimplifyCFG(BB, TTI, TD) | true; + if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0), DL)) + return SimplifyCFG(BB, TTI, DL) | true; } - } else if (BI->getSuccessor(1)->getSinglePredecessor() != 0) { + } else if (BI->getSuccessor(1)->getSinglePredecessor()) { // If Successor #0 has multiple preds, we may be able to conditionally - // execute Successor #1 if it branches to successor #0. + // execute Successor #1 if it branches to Successor #0. TerminatorInst *Succ1TI = BI->getSuccessor(1)->getTerminator(); if (Succ1TI->getNumSuccessors() == 1 && Succ1TI->getSuccessor(0) == BI->getSuccessor(0)) - if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1))) - return SimplifyCFG(BB, TTI, TD) | true; + if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1), DL)) + return SimplifyCFG(BB, TTI, DL) | true; } // If this is a branch on a phi node in the current block, thread control // through this block if any PHI node entries are constants. if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition())) if (PN->getParent() == BI->getParent()) - if (FoldCondBranchOnPHI(BI, TD)) - return SimplifyCFG(BB, TTI, TD) | true; + if (FoldCondBranchOnPHI(BI, DL)) + return SimplifyCFG(BB, TTI, DL) | true; // Scan predecessor blocks for conditional branches. for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator())) if (PBI != BI && PBI->isConditional()) if (SimplifyCondBranchToCondBranch(PBI, BI)) - return SimplifyCFG(BB, TTI, TD) | true; + return SimplifyCFG(BB, TTI, DL) | true; return false; } @@ -4023,7 +4130,7 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) { if (C->isNullValue()) { // Only look at the first use, avoid hurting compile time with long uselists - User *Use = *I->use_begin(); + User *Use = *I->user_begin(); // Now make sure that there are no instructions in between that can alter // control flow (eg. 
calls) @@ -4119,7 +4226,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { // eliminate it, do so now. if (PHINode *PN = dyn_cast<PHINode>(BB->begin())) if (PN->getNumIncomingValues() == 2) - Changed |= FoldTwoEntryPHINode(PN, TD); + Changed |= FoldTwoEntryPHINode(PN, DL); Builder.SetInsertPoint(BB->getTerminator()); if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { @@ -4151,6 +4258,6 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { /// of the CFG. It returns true if a modification was made. /// bool llvm::SimplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI, - const DataLayout *TD) { - return SimplifyCFGOpt(TTI, TD).run(BB); + const DataLayout *DL) { + return SimplifyCFGOpt(TTI, DL).run(BB); } diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp index bf3442a..b284e6f 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -13,9 +13,8 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "indvars" - #include "llvm/Transforms/Utils/SimplifyIndVar.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/IVUsers.h" @@ -23,13 +22,18 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "indvars" + STATISTIC(NumElimIdentity, "Number of IV identities eliminated"); STATISTIC(NumElimOperand, "Number of IV operands folded into a use"); STATISTIC(NumElimRem , "Number of IV remainder operations eliminated"); @@ -44,7 +48,7 @@ namespace { Loop *L; LoopInfo *LI; ScalarEvolution *SE; - const DataLayout *TD; // May be NULL + const DataLayout *DL; // May be NULL SmallVectorImpl<WeakVH> &DeadInsts; @@ -52,13 +56,14 @@ namespace { public: SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, LPPassManager *LPM, - SmallVectorImpl<WeakVH> &Dead, IVUsers *IVU = NULL) : + SmallVectorImpl<WeakVH> &Dead, IVUsers *IVU = nullptr) : L(Loop), LI(LPM->getAnalysisIfAvailable<LoopInfo>()), SE(SE), - TD(LPM->getAnalysisIfAvailable<DataLayout>()), DeadInsts(Dead), Changed(false) { + DataLayoutPass *DLP = LPM->getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; assert(LI && "IV simplification requires LoopInfo"); } @@ -67,7 +72,7 @@ namespace { /// Iteratively perform simplification on a worklist of users of the /// specified induction variable. This is the top-level driver that applies /// all simplicitions to users of an IV. - void simplifyUsers(PHINode *CurrIV, IVVisitor *V = NULL); + void simplifyUsers(PHINode *CurrIV, IVVisitor *V = nullptr); Value *foldIVUser(Instruction *UseInst, Instruction *IVOperand); @@ -75,6 +80,9 @@ namespace { void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand); void eliminateIVRemainder(BinaryOperator *Rem, Value *IVOperand, bool IsSigned); + + Instruction *splitOverflowIntrinsic(Instruction *IVUser, + const DominatorTree *DT); }; } @@ -87,25 +95,25 @@ namespace { /// be folded (in case more folding opportunities have been exposed). /// Otherwise return null. 
Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand) { - Value *IVSrc = 0; + Value *IVSrc = nullptr; unsigned OperIdx = 0; - const SCEV *FoldedExpr = 0; + const SCEV *FoldedExpr = nullptr; switch (UseInst->getOpcode()) { default: - return 0; + return nullptr; case Instruction::UDiv: case Instruction::LShr: // We're only interested in the case where we know something about // the numerator and have a constant denominator. if (IVOperand != UseInst->getOperand(OperIdx) || !isa<ConstantInt>(UseInst->getOperand(1))) - return 0; + return nullptr; // Attempt to fold a binary operator with constant operand. // e.g. ((I + 1) >> 2) => I >> 2 if (!isa<BinaryOperator>(IVOperand) || !isa<ConstantInt>(IVOperand->getOperand(1))) - return 0; + return nullptr; IVSrc = IVOperand->getOperand(0); // IVSrc must be the (SCEVable) IV, since the other operand is const. @@ -116,7 +124,7 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand) // Get a constant for the divisor. See createSCEV. uint32_t BitWidth = cast<IntegerType>(UseInst->getType())->getBitWidth(); if (D->getValue().uge(BitWidth)) - return 0; + return nullptr; D = ConstantInt::get(UseInst->getContext(), APInt::getOneBitSet(BitWidth, D->getZExtValue())); @@ -125,11 +133,11 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand) } // We have something that might fold it's operand. Compare SCEVs. if (!SE->isSCEVable(UseInst->getType())) - return 0; + return nullptr; // Bypass the operand if SCEV can prove it has no effect. if (SE->getSCEV(UseInst) != FoldedExpr) - return 0; + return nullptr; DEBUG(dbgs() << "INDVARS: Eliminated IV operand: " << *IVOperand << " -> " << *UseInst << '\n'); @@ -263,6 +271,69 @@ bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst, return true; } +/// \brief Split sadd.with.overflow into add + sadd.with.overflow to allow +/// analysis and optimization. +/// +/// \return A new value representing the non-overflowing add if possible, +/// otherwise return the original value. +Instruction *SimplifyIndvar::splitOverflowIntrinsic(Instruction *IVUser, + const DominatorTree *DT) { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(IVUser); + if (!II || II->getIntrinsicID() != Intrinsic::sadd_with_overflow) + return IVUser; + + // Find a branch guarded by the overflow check. + BranchInst *Branch = nullptr; + Instruction *AddVal = nullptr; + for (User *U : II->users()) { + if (ExtractValueInst *ExtractInst = dyn_cast<ExtractValueInst>(U)) { + if (ExtractInst->getNumIndices() != 1) + continue; + if (ExtractInst->getIndices()[0] == 0) + AddVal = ExtractInst; + else if (ExtractInst->getIndices()[0] == 1 && ExtractInst->hasOneUse()) + Branch = dyn_cast<BranchInst>(ExtractInst->user_back()); + } + } + if (!AddVal || !Branch) + return IVUser; + + BasicBlock *ContinueBB = Branch->getSuccessor(1); + if (std::next(pred_begin(ContinueBB)) != pred_end(ContinueBB)) + return IVUser; + + // Check if all users of the add are provably NSW. + bool AllNSW = true; + for (Use &U : AddVal->uses()) { + if (Instruction *UseInst = dyn_cast<Instruction>(U.getUser())) { + BasicBlock *UseBB = UseInst->getParent(); + if (PHINode *PHI = dyn_cast<PHINode>(UseInst)) + UseBB = PHI->getIncomingBlock(U); + if (!DT->dominates(ContinueBB, UseBB)) { + AllNSW = false; + break; + } + } + } + if (!AllNSW) + return IVUser; + + // Go for it... 
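// A standalone illustration (plain C++, using the GCC/Clang
// __builtin_add_overflow builtin) of what splitOverflowIntrinsic arranges:
// when the overflow bit guards a branch and every user of the sum is only
// reachable on the no-overflow path, the sum can be recomputed as an
// ordinary add that is known not to wrap -- the NSW add created above --
// which later analyses such as SCEV can reason about.
#include <cstdio>

static int useOfSum(int Sum) { return Sum * 2; }

int main() {
  int A = 1000, B = 337, Sum;
  if (__builtin_add_overflow(A, B, &Sum)) { // the sadd.with.overflow check
    std::printf("overflow: bail out\n");    // the guarded failure path
    return 1;
  }
  // Dominated by the check: on this path the addition provably did not
  // wrap, so a plain add computes the same value (models the new NSW add).
  int Plain = A + B;
  std::printf("%d %d\n", useOfSum(Sum), useOfSum(Plain));
  return 0;
}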
+ IRBuilder<> Builder(IVUser); + Instruction *AddInst = dyn_cast<Instruction>( + Builder.CreateNSWAdd(II->getOperand(0), II->getOperand(1))); + + // The caller expects the new add to have the same form as the intrinsic. The + // IV operand position must be the same. + assert((AddInst->getOpcode() == Instruction::Add && + AddInst->getOperand(0) == II->getOperand(0)) && + "Bad add instruction created from overflow intrinsic."); + + AddVal->replaceAllUsesWith(AddInst); + DeadInsts.push_back(AddVal); + return AddInst; +} + /// pushIVUsers - Add all uses of Def to the current IV's worklist. /// static void pushIVUsers( @@ -270,16 +341,15 @@ static void pushIVUsers( SmallPtrSet<Instruction*,16> &Simplified, SmallVectorImpl< std::pair<Instruction*,Instruction*> > &SimpleIVUsers) { - for (Value::use_iterator UI = Def->use_begin(), E = Def->use_end(); - UI != E; ++UI) { - Instruction *User = cast<Instruction>(*UI); + for (User *U : Def->users()) { + Instruction *UI = cast<Instruction>(U); // Avoid infinite or exponential worklist processing. // Also ensure unique worklist users. // If Def is a LoopPhi, it may not be in the Simplified set, so check for // self edges first. - if (User != Def && Simplified.insert(User)) - SimpleIVUsers.push_back(std::make_pair(User, Def)); + if (UI != Def && Simplified.insert(UI)) + SimpleIVUsers.push_back(std::make_pair(UI, Def)); } } @@ -334,8 +404,16 @@ void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) { while (!SimpleIVUsers.empty()) { std::pair<Instruction*, Instruction*> UseOper = SimpleIVUsers.pop_back_val(); + Instruction *UseInst = UseOper.first; + // Bypass back edges to avoid extra work. - if (UseOper.first == CurrIV) continue; + if (UseInst == CurrIV) continue; + + if (V && V->shouldSplitOverflowInstrinsics()) { + UseInst = splitOverflowIntrinsic(UseInst, V->getDomTree()); + if (!UseInst) + continue; + } Instruction *IVOperand = UseOper.second; for (unsigned N = 0; IVOperand; ++N) { diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp index f9687e4d..33b3637 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp @@ -14,14 +14,13 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "instsimplify" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" #include "llvm/Pass.h" @@ -29,6 +28,8 @@ #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "instsimplify" + STATISTIC(NumSimplified, "Number of redundant instructions removed"); namespace { @@ -38,23 +39,27 @@ namespace { initializeInstSimplifierPass(*PassRegistry::getPassRegistry()); } - void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<TargetLibraryInfo>(); } /// runOnFunction - Remove instructions that simplify. 
- bool runOnFunction(Function &F) { - const DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>(); - const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); + bool runOnFunction(Function &F) override { + const DominatorTreeWrapperPass *DTWP = + getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); SmallPtrSet<const Instruction*, 8> S1, S2, *ToSimplify = &S1, *Next = &S2; bool Changed = false; do { - for (df_iterator<BasicBlock*> DI = df_begin(&F.getEntryBlock()), - DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) - for (BasicBlock::iterator BI = DI->begin(), BE = DI->end(); BI != BE;) { + for (BasicBlock *BB : depth_first(&F.getEntryBlock())) + // Here be subtlety: the iterator must be incremented before the loop + // body (not sure why), so a range-for loop won't work here. + for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { Instruction *I = BI++; // The first time through the loop ToSimplify is empty and we try to // simplify all instructions. On later iterations ToSimplify is not @@ -63,16 +68,23 @@ namespace { continue; // Don't waste time simplifying unused instructions. if (!I->use_empty()) - if (Value *V = SimplifyInstruction(I, TD, TLI, DT)) { + if (Value *V = SimplifyInstruction(I, DL, TLI, DT)) { // Mark all uses for resimplification next time round the loop. - for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); - UI != UE; ++UI) - Next->insert(cast<Instruction>(*UI)); + for (User *U : I->users()) + Next->insert(cast<Instruction>(U)); I->replaceAllUsesWith(V); ++NumSimplified; Changed = true; } - Changed |= RecursivelyDeleteTriviallyDeadInstructions(I, TLI); + bool res = RecursivelyDeleteTriviallyDeadInstructions(I, TLI); + if (res) { + // RecursivelyDeleteTriviallyDeadInstruction can remove + // more than one instruction, so simply incrementing the + // iterator does not work. When instructions get deleted + // re-iterate instead. + BI = BB->begin(); BE = BB->end(); + Changed |= res; + } } // Place the list of instructions to simplify on the next loop iteration diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 15b3e66..3b61bb5 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/Triple.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" @@ -43,7 +44,7 @@ namespace { class LibCallOptimization { protected: Function *Caller; - const DataLayout *TD; + const DataLayout *DL; const TargetLibraryInfo *TLI; const LibCallSimplifier *LCS; LLVMContext* Context; @@ -63,11 +64,11 @@ public: /// change the calling convention. 
virtual bool ignoreCallingConv() { return false; } - Value *optimizeCall(CallInst *CI, const DataLayout *TD, + Value *optimizeCall(CallInst *CI, const DataLayout *DL, const TargetLibraryInfo *TLI, const LibCallSimplifier *LCS, IRBuilder<> &B) { Caller = CI->getParent()->getParent(); - this->TD = TD; + this->DL = DL; this->TLI = TLI; this->LCS = LCS; if (CI->getCalledFunction()) @@ -75,7 +76,7 @@ public: // We never change the calling convention. if (!ignoreCallingConv() && CI->getCallingConv() != llvm::CallingConv::C) - return NULL; + return nullptr; return callOptimizer(CI->getCalledFunction(), CI, B); } @@ -88,9 +89,8 @@ public: /// isOnlyUsedInZeroEqualityComparison - Return true if it only matters that the /// value is equal or not-equal to zero. static bool isOnlyUsedInZeroEqualityComparison(Value *V) { - for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); - UI != E; ++UI) { - if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI)) + for (User *U : V->users()) { + if (ICmpInst *IC = dyn_cast<ICmpInst>(U)) if (IC->isEquality()) if (Constant *C = dyn_cast<Constant>(IC->getOperand(1))) if (C->isNullValue()) @@ -104,9 +104,8 @@ static bool isOnlyUsedInZeroEqualityComparison(Value *V) { /// isOnlyUsedInEqualityComparison - Return true if it is only used in equality /// comparisons with With. static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) { - for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); - UI != E; ++UI) { - if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI)) + for (User *U : V->users()) { + if (ICmpInst *IC = dyn_cast<ICmpInst>(U)) if (IC->isEquality() && IC->getOperand(1) == With) continue; // Unknown instruction. @@ -152,7 +151,8 @@ protected: struct InstFortifiedLibCallOptimization : public FortifiedLibCallOptimization { CallInst *CI; - bool isFoldable(unsigned SizeCIOp, unsigned SizeArgOp, bool isString) const { + bool isFoldable(unsigned SizeCIOp, unsigned SizeArgOp, + bool isString) const override { if (CI->getArgOperand(SizeCIOp) == CI->getArgOperand(SizeArgOp)) return true; if (ConstantInt *SizeCI = @@ -175,7 +175,8 @@ struct InstFortifiedLibCallOptimization : public FortifiedLibCallOptimization { }; struct MemCpyChkOpt : public InstFortifiedLibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { this->CI = CI; FunctionType *FT = Callee->getFunctionType(); LLVMContext &Context = CI->getParent()->getContext(); @@ -184,21 +185,22 @@ struct MemCpyChkOpt : public InstFortifiedLibCallOptimization { if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || - FT->getParamType(2) != TD->getIntPtrType(Context) || - FT->getParamType(3) != TD->getIntPtrType(Context)) - return 0; + FT->getParamType(2) != DL->getIntPtrType(Context) || + FT->getParamType(3) != DL->getIntPtrType(Context)) + return nullptr; if (isFoldable(3, 2, false)) { B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), 1); return CI->getArgOperand(0); } - return 0; + return nullptr; } }; struct MemMoveChkOpt : public InstFortifiedLibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { this->CI = CI; FunctionType *FT = Callee->getFunctionType(); LLVMContext &Context = CI->getParent()->getContext(); @@ -207,21 +209,22 @@ 
struct MemMoveChkOpt : public InstFortifiedLibCallOptimization { if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || - FT->getParamType(2) != TD->getIntPtrType(Context) || - FT->getParamType(3) != TD->getIntPtrType(Context)) - return 0; + FT->getParamType(2) != DL->getIntPtrType(Context) || + FT->getParamType(3) != DL->getIntPtrType(Context)) + return nullptr; if (isFoldable(3, 2, false)) { B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), 1); return CI->getArgOperand(0); } - return 0; + return nullptr; } }; struct MemSetChkOpt : public InstFortifiedLibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { this->CI = CI; FunctionType *FT = Callee->getFunctionType(); LLVMContext &Context = CI->getParent()->getContext(); @@ -230,9 +233,9 @@ struct MemSetChkOpt : public InstFortifiedLibCallOptimization { if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isIntegerTy() || - FT->getParamType(2) != TD->getIntPtrType(Context) || - FT->getParamType(3) != TD->getIntPtrType(Context)) - return 0; + FT->getParamType(2) != DL->getIntPtrType(Context) || + FT->getParamType(3) != DL->getIntPtrType(Context)) + return nullptr; if (isFoldable(3, 2, false)) { Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), @@ -240,12 +243,13 @@ struct MemSetChkOpt : public InstFortifiedLibCallOptimization { B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1); return CI->getArgOperand(0); } - return 0; + return nullptr; } }; struct StrCpyChkOpt : public InstFortifiedLibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { this->CI = CI; StringRef Name = Callee->getName(); FunctionType *FT = Callee->getFunctionType(); @@ -256,8 +260,8 @@ struct StrCpyChkOpt : public InstFortifiedLibCallOptimization { FT->getReturnType() != FT->getParamType(0) || FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != Type::getInt8PtrTy(Context) || - FT->getParamType(2) != TD->getIntPtrType(Context)) - return 0; + FT->getParamType(2) != DL->getIntPtrType(Context)) + return nullptr; Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); if (Dst == Src) // __strcpy_chk(x,x) -> x @@ -269,28 +273,29 @@ struct StrCpyChkOpt : public InstFortifiedLibCallOptimization { // TODO: It might be nice to get a maximum length out of the possible // string lengths for varying. if (isFoldable(2, 1, true)) { - Value *Ret = EmitStrCpy(Dst, Src, B, TD, TLI, Name.substr(2, 6)); + Value *Ret = EmitStrCpy(Dst, Src, B, DL, TLI, Name.substr(2, 6)); return Ret; } else { // Maybe we can stil fold __strcpy_chk to __memcpy_chk. uint64_t Len = GetStringLength(Src); - if (Len == 0) return 0; + if (Len == 0) return nullptr; // This optimization require DataLayout. 
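// A standalone model of the isFoldable() test that gates these *_chk folds
// (simplified; illustrative names): the fortified call may be lowered to the
// plain libcall when the object-size operand is the all-ones "unknown size"
// sentinel, or when it is a known constant that covers the copy length.
#include <cstdint>
#include <cstdio>

static bool foldable(uint64_t ObjSizeCI, uint64_t LenCI) {
  if (ObjSizeCI == UINT64_MAX) // -1: the runtime check can never fire
    return true;
  return ObjSizeCI >= LenCI;   // destination provably large enough
}

int main() {
  std::printf("%d %d %d\n",
              foldable(UINT64_MAX, 100), // 1: size unknown, fold
              foldable(64, 16),          // 1: 16 bytes into 64, fold
              foldable(8, 16));          // 0: keep the runtime check
  return 0;
}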
- if (!TD) return 0; + if (!DL) return nullptr; Value *Ret = EmitMemCpyChk(Dst, Src, - ConstantInt::get(TD->getIntPtrType(Context), Len), - CI->getArgOperand(2), B, TD, TLI); + ConstantInt::get(DL->getIntPtrType(Context), Len), + CI->getArgOperand(2), B, DL, TLI); return Ret; } - return 0; + return nullptr; } }; struct StpCpyChkOpt : public InstFortifiedLibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { this->CI = CI; StringRef Name = Callee->getName(); FunctionType *FT = Callee->getFunctionType(); @@ -301,13 +306,13 @@ struct StpCpyChkOpt : public InstFortifiedLibCallOptimization { FT->getReturnType() != FT->getParamType(0) || FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != Type::getInt8PtrTy(Context) || - FT->getParamType(2) != TD->getIntPtrType(FT->getParamType(0))) - return 0; + FT->getParamType(2) != DL->getIntPtrType(FT->getParamType(0))) + return nullptr; Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x) - Value *StrLen = EmitStrLen(Src, B, TD, TLI); - return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : 0; + Value *StrLen = EmitStrLen(Src, B, DL, TLI); + return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : nullptr; } // If a) we don't have any length information, or b) we know this will @@ -316,31 +321,32 @@ struct StpCpyChkOpt : public InstFortifiedLibCallOptimization { // TODO: It might be nice to get a maximum length out of the possible // string lengths for varying. if (isFoldable(2, 1, true)) { - Value *Ret = EmitStrCpy(Dst, Src, B, TD, TLI, Name.substr(2, 6)); + Value *Ret = EmitStrCpy(Dst, Src, B, DL, TLI, Name.substr(2, 6)); return Ret; } else { // Maybe we can stil fold __stpcpy_chk to __memcpy_chk. uint64_t Len = GetStringLength(Src); - if (Len == 0) return 0; + if (Len == 0) return nullptr; // This optimization require DataLayout. 
- if (!TD) return 0; + if (!DL) return nullptr; Type *PT = FT->getParamType(0); - Value *LenV = ConstantInt::get(TD->getIntPtrType(PT), Len); + Value *LenV = ConstantInt::get(DL->getIntPtrType(PT), Len); Value *DstEnd = B.CreateGEP(Dst, - ConstantInt::get(TD->getIntPtrType(PT), + ConstantInt::get(DL->getIntPtrType(PT), Len - 1)); - if (!EmitMemCpyChk(Dst, Src, LenV, CI->getArgOperand(2), B, TD, TLI)) - return 0; + if (!EmitMemCpyChk(Dst, Src, LenV, CI->getArgOperand(2), B, DL, TLI)) + return nullptr; return DstEnd; } - return 0; + return nullptr; } }; struct StrNCpyChkOpt : public InstFortifiedLibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { this->CI = CI; StringRef Name = Callee->getName(); FunctionType *FT = Callee->getFunctionType(); @@ -351,16 +357,16 @@ struct StrNCpyChkOpt : public InstFortifiedLibCallOptimization { FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != Type::getInt8PtrTy(Context) || !FT->getParamType(2)->isIntegerTy() || - FT->getParamType(3) != TD->getIntPtrType(Context)) - return 0; + FT->getParamType(3) != DL->getIntPtrType(Context)) + return nullptr; if (isFoldable(3, 2, false)) { Value *Ret = EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), B, TD, TLI, + CI->getArgOperand(2), B, DL, TLI, Name.substr(2, 7)); return Ret; } - return 0; + return nullptr; } }; @@ -369,14 +375,15 @@ struct StrNCpyChkOpt : public InstFortifiedLibCallOptimization { //===----------------------------------------------------------------------===// struct StrCatOpt : public LibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { // Verify the "strcat" function prototype. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getReturnType() != B.getInt8PtrTy() || FT->getParamType(0) != FT->getReturnType() || FT->getParamType(1) != FT->getReturnType()) - return 0; + return nullptr; // Extract some information from the instruction Value *Dst = CI->getArgOperand(0); @@ -384,7 +391,7 @@ struct StrCatOpt : public LibCallOptimization { // See if we can get the length of the input string. uint64_t Len = GetStringLength(Src); - if (Len == 0) return 0; + if (Len == 0) return nullptr; --Len; // Unbias length. // Handle the simple, do-nothing case: strcat(x, "") -> x @@ -392,7 +399,7 @@ struct StrCatOpt : public LibCallOptimization { return Dst; // These optimizations require DataLayout. - if (!TD) return 0; + if (!DL) return nullptr; return emitStrLenMemCpy(Src, Dst, Len, B); } @@ -401,9 +408,9 @@ struct StrCatOpt : public LibCallOptimization { IRBuilder<> &B) { // We need to find the end of the destination string. That's where the // memory is to be moved to. We just generate a call to strlen. - Value *DstLen = EmitStrLen(Dst, B, TD, TLI); + Value *DstLen = EmitStrLen(Dst, B, DL, TLI); if (!DstLen) - return 0; + return nullptr; // Now that we have the destination's length, we must index into the // destination's pointer to get the actual memcpy destination (end of @@ -413,13 +420,14 @@ struct StrCatOpt : public LibCallOptimization { // We have enough information to now generate the memcpy call to do the // concatenation for us. Make a memcpy to copy the nul byte with align = 1. 
B.CreateMemCpy(CpyDst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len + 1), 1); + ConstantInt::get(DL->getIntPtrType(*Context), Len + 1), 1); return Dst; } }; struct StrNCatOpt : public StrCatOpt { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { // Verify the "strncat" function prototype. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || @@ -427,7 +435,7 @@ struct StrNCatOpt : public StrCatOpt { FT->getParamType(0) != FT->getReturnType() || FT->getParamType(1) != FT->getReturnType() || !FT->getParamType(2)->isIntegerTy()) - return 0; + return nullptr; // Extract some information from the instruction Value *Dst = CI->getArgOperand(0); @@ -438,11 +446,11 @@ struct StrNCatOpt : public StrCatOpt { if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2))) Len = LengthArg->getZExtValue(); else - return 0; + return nullptr; // See if we can get the length of the input string. uint64_t SrcLen = GetStringLength(Src); - if (SrcLen == 0) return 0; + if (SrcLen == 0) return nullptr; --SrcLen; // Unbias length. // Handle the simple, do-nothing cases: @@ -451,10 +459,10 @@ struct StrNCatOpt : public StrCatOpt { if (SrcLen == 0 || Len == 0) return Dst; // These optimizations require DataLayout. - if (!TD) return 0; + if (!DL) return nullptr; // We don't optimize this case - if (Len < SrcLen) return 0; + if (Len < SrcLen) return nullptr; // strncat(x, s, c) -> strcat(x, s) // s is constant so the strcat can be optimized further @@ -463,38 +471,42 @@ struct StrNCatOpt : public StrCatOpt { }; struct StrChrOpt : public LibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { // Verify the "strchr" function prototype. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getReturnType() != B.getInt8PtrTy() || FT->getParamType(0) != FT->getReturnType() || !FT->getParamType(1)->isIntegerTy(32)) - return 0; + return nullptr; Value *SrcStr = CI->getArgOperand(0); // If the second operand is non-constant, see if we can compute the length // of the input string and turn this into memchr. ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); - if (CharC == 0) { + if (!CharC) { // These optimizations require DataLayout. - if (!TD) return 0; + if (!DL) return nullptr; uint64_t Len = GetStringLength(SrcStr); if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32))// memchr needs i32. - return 0; + return nullptr; return EmitMemChr(SrcStr, CI->getArgOperand(1), // include nul. - ConstantInt::get(TD->getIntPtrType(*Context), Len), - B, TD, TLI); + ConstantInt::get(DL->getIntPtrType(*Context), Len), + B, DL, TLI); } // Otherwise, the character is a constant, see if the first argument is // a string literal. If so, we can constant fold. StringRef Str; - if (!getConstantStringInfo(SrcStr, Str)) - return 0; + if (!getConstantStringInfo(SrcStr, Str)) { + if (DL && CharC->isZero()) // strchr(p, 0) -> p + strlen(p) + return B.CreateGEP(SrcStr, EmitStrLen(SrcStr, B, DL, TLI), "strchr"); + return nullptr; + } // Compute the offset, make sure to handle the case when we're searching for // zero (a weird way to spell strlen). 
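// A standalone model of the strcat fold just shown (illustrative helper;
// SrcLen stands for the constant GetStringLength discovers): strcat(dst, src)
// becomes "find the end with strlen, then memcpy SrcLen + 1 bytes", where the
// +1 copies the nul terminator as the comment above describes.
#include <cstddef>
#include <cstdio>
#include <cstring>

static char *strcatFolded(char *Dst, const char *Src, std::size_t SrcLen) {
  char *CpyDst = Dst + std::strlen(Dst); // end of the destination string
  std::memcpy(CpyDst, Src, SrcLen + 1);  // +1: include the nul byte
  return Dst;
}

int main() {
  char A[32] = "foo", B[32] = "foo";
  std::strcat(A, "bar");
  strcatFolded(B, "bar", 3);             // 3 == strlen("bar"), a constant
  std::printf("%s %s equal=%d\n", A, B, std::strcmp(A, B) == 0);
  return 0;
}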
@@ -509,28 +521,29 @@ struct StrChrOpt : public LibCallOptimization { }; struct StrRChrOpt : public LibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { // Verify the "strrchr" function prototype. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getReturnType() != B.getInt8PtrTy() || FT->getParamType(0) != FT->getReturnType() || !FT->getParamType(1)->isIntegerTy(32)) - return 0; + return nullptr; Value *SrcStr = CI->getArgOperand(0); ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); // Cannot fold anything if we're not looking for a constant. if (!CharC) - return 0; + return nullptr; StringRef Str; if (!getConstantStringInfo(SrcStr, Str)) { // strrchr(s, 0) -> strchr(s, 0) - if (TD && CharC->isZero()) - return EmitStrChr(SrcStr, '\0', B, TD, TLI); - return 0; + if (DL && CharC->isZero()) + return EmitStrChr(SrcStr, '\0', B, DL, TLI); + return nullptr; } // Compute the offset. @@ -545,14 +558,15 @@ struct StrRChrOpt : public LibCallOptimization { }; struct StrCmpOpt : public LibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { // Verify the "strcmp" function prototype. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || !FT->getReturnType()->isIntegerTy(32) || FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != B.getInt8PtrTy()) - return 0; + return nullptr; Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); if (Str1P == Str2P) // strcmp(x,x) -> 0 @@ -578,19 +592,20 @@ struct StrCmpOpt : public LibCallOptimization { uint64_t Len2 = GetStringLength(Str2P); if (Len1 && Len2) { // These optimizations require DataLayout. - if (!TD) return 0; + if (!DL) return nullptr; return EmitMemCmp(Str1P, Str2P, - ConstantInt::get(TD->getIntPtrType(*Context), - std::min(Len1, Len2)), B, TD, TLI); + ConstantInt::get(DL->getIntPtrType(*Context), + std::min(Len1, Len2)), B, DL, TLI); } - return 0; + return nullptr; } }; struct StrNCmpOpt : public LibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { // Verify the "strncmp" function prototype. 
FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || @@ -598,7 +613,7 @@ struct StrNCmpOpt : public LibCallOptimization { FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != B.getInt8PtrTy() || !FT->getParamType(2)->isIntegerTy()) - return 0; + return nullptr; Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); if (Str1P == Str2P) // strncmp(x,x,n) -> 0 @@ -609,13 +624,13 @@ struct StrNCmpOpt : public LibCallOptimization { if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2))) Length = LengthArg->getZExtValue(); else - return 0; + return nullptr; if (Length == 0) // strncmp(x,y,0) -> 0 return ConstantInt::get(CI->getType(), 0); - if (TD && Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1) - return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, TD, TLI); + if (DL && Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1) + return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, DL, TLI); StringRef Str1, Str2; bool HasStr1 = getConstantStringInfo(Str1P, Str1); @@ -635,66 +650,68 @@ struct StrNCmpOpt : public LibCallOptimization { if (HasStr2 && Str2.empty()) // strncmp(x, "", n) -> *x return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); - return 0; + return nullptr; } }; struct StrCpyOpt : public LibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { // Verify the "strcpy" function prototype. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != B.getInt8PtrTy()) - return 0; + return nullptr; Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); if (Dst == Src) // strcpy(x,x) -> x return Src; // These optimizations require DataLayout. - if (!TD) return 0; + if (!DL) return nullptr; // See if we can get the length of the input string. uint64_t Len = GetStringLength(Src); - if (Len == 0) return 0; + if (Len == 0) return nullptr; // We have enough information to now generate the memcpy call to do the // copy for us. Make a memcpy to copy the nul byte with align = 1. B.CreateMemCpy(Dst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len), 1); + ConstantInt::get(DL->getIntPtrType(*Context), Len), 1); return Dst; } }; struct StpCpyOpt: public LibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { // Verify the "stpcpy" function prototype. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != B.getInt8PtrTy()) - return 0; + return nullptr; // These optimizations require DataLayout. - if (!TD) return 0; + if (!DL) return nullptr; Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x) - Value *StrLen = EmitStrLen(Src, B, TD, TLI); - return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : 0; + Value *StrLen = EmitStrLen(Src, B, DL, TLI); + return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : nullptr; } // See if we can get the length of the input string. 
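// For example (illustrative only): for stpcpy(dst, "hi"), GetStringLength
// returns 3 (two characters plus the nul), so the code below memcpys all
// 3 bytes and returns dst + 2 -- the address of the copied nul, which is
// exactly what stpcpy must return.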
uint64_t Len = GetStringLength(Src); - if (Len == 0) return 0; + if (Len == 0) return nullptr; Type *PT = FT->getParamType(0); - Value *LenV = ConstantInt::get(TD->getIntPtrType(PT), Len); + Value *LenV = ConstantInt::get(DL->getIntPtrType(PT), Len); Value *DstEnd = B.CreateGEP(Dst, - ConstantInt::get(TD->getIntPtrType(PT), + ConstantInt::get(DL->getIntPtrType(PT), Len - 1)); // We have enough information to now generate the memcpy call to do the @@ -705,13 +722,14 @@ struct StpCpyOpt: public LibCallOptimization { }; struct StrNCpyOpt : public LibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != B.getInt8PtrTy() || !FT->getParamType(2)->isIntegerTy()) - return 0; + return nullptr; Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); @@ -719,7 +737,7 @@ struct StrNCpyOpt : public LibCallOptimization { // See if we can get the length of the input string. uint64_t SrcLen = GetStringLength(Src); - if (SrcLen == 0) return 0; + if (SrcLen == 0) return nullptr; --SrcLen; if (SrcLen == 0) { @@ -732,33 +750,34 @@ struct StrNCpyOpt : public LibCallOptimization { if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(LenOp)) Len = LengthArg->getZExtValue(); else - return 0; + return nullptr; if (Len == 0) return Dst; // strncpy(x, y, 0) -> x // These optimizations require DataLayout. - if (!TD) return 0; + if (!DL) return nullptr; // Let strncpy handle the zero padding - if (Len > SrcLen+1) return 0; + if (Len > SrcLen+1) return nullptr; Type *PT = FT->getParamType(0); // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant] B.CreateMemCpy(Dst, Src, - ConstantInt::get(TD->getIntPtrType(PT), Len), 1); + ConstantInt::get(DL->getIntPtrType(PT), Len), 1); return Dst; } }; struct StrLenOpt : public LibCallOptimization { - virtual bool ignoreCallingConv() { return true; } - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + bool ignoreCallingConv() override { return true; } + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 1 || FT->getParamType(0) != B.getInt8PtrTy() || !FT->getReturnType()->isIntegerTy()) - return 0; + return nullptr; Value *Src = CI->getArgOperand(0); @@ -766,22 +785,38 @@ struct StrLenOpt : public LibCallOptimization { if (uint64_t Len = GetStringLength(Src)) return ConstantInt::get(CI->getType(), Len-1); + // strlen(x?"foo":"bars") --> x ? 
3 : 4 + if (SelectInst *SI = dyn_cast<SelectInst>(Src)) { + uint64_t LenTrue = GetStringLength(SI->getTrueValue()); + uint64_t LenFalse = GetStringLength(SI->getFalseValue()); + if (LenTrue && LenFalse) { + emitOptimizationRemark(*Context, "simplify-libcalls", *Caller, + SI->getDebugLoc(), + "folded strlen(select) to select of constants"); + return B.CreateSelect(SI->getCondition(), + ConstantInt::get(CI->getType(), LenTrue-1), + ConstantInt::get(CI->getType(), LenFalse-1)); + } + } + // strlen(x) != 0 --> *x != 0 // strlen(x) == 0 --> *x == 0 if (isOnlyUsedInZeroEqualityComparison(CI)) return B.CreateZExt(B.CreateLoad(Src, "strlenfirst"), CI->getType()); - return 0; + + return nullptr; } }; struct StrPBrkOpt : public LibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getParamType(0) != B.getInt8PtrTy() || FT->getParamType(1) != FT->getParamType(0) || FT->getReturnType() != FT->getParamType(0)) - return 0; + return nullptr; StringRef S1, S2; bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); @@ -802,20 +837,21 @@ struct StrPBrkOpt : public LibCallOptimization { } // strpbrk(s, "a") -> strchr(s, 'a') - if (TD && HasS2 && S2.size() == 1) - return EmitStrChr(CI->getArgOperand(0), S2[0], B, TD, TLI); + if (DL && HasS2 && S2.size() == 1) + return EmitStrChr(CI->getArgOperand(0), S2[0], B, DL, TLI); - return 0; + return nullptr; } }; struct StrToOpt : public LibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { FunctionType *FT = Callee->getFunctionType(); if ((FT->getNumParams() != 2 && FT->getNumParams() != 3) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy()) - return 0; + return nullptr; Value *EndPtr = CI->getArgOperand(1); if (isa<ConstantPointerNull>(EndPtr)) { @@ -824,18 +860,19 @@ struct StrToOpt : public LibCallOptimization { CI->addAttribute(1, Attribute::NoCapture); } - return 0; + return nullptr; } }; struct StrSpnOpt : public LibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getParamType(0) != B.getInt8PtrTy() || FT->getParamType(1) != FT->getParamType(0) || !FT->getReturnType()->isIntegerTy()) - return 0; + return nullptr; StringRef S1, S2; bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); @@ -853,18 +890,19 @@ struct StrSpnOpt : public LibCallOptimization { return ConstantInt::get(CI->getType(), Pos); } - return 0; + return nullptr; } }; struct StrCSpnOpt : public LibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getParamType(0) != B.getInt8PtrTy() || FT->getParamType(1) != FT->getParamType(0) || !FT->getReturnType()->isIntegerTy()) - return 0; + return nullptr; StringRef S1, S2; bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); @@ -882,37 +920,37 @@ struct StrCSpnOpt : public LibCallOptimization { } // strcspn(s, "") -> strlen(s) - if (TD && 
HasS2 && S2.empty()) - return EmitStrLen(CI->getArgOperand(0), B, TD, TLI); + if (DL && HasS2 && S2.empty()) + return EmitStrLen(CI->getArgOperand(0), B, DL, TLI); - return 0; + return nullptr; } }; struct StrStrOpt : public LibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || !FT->getReturnType()->isPointerTy()) - return 0; + return nullptr; // fold strstr(x, x) -> x. if (CI->getArgOperand(0) == CI->getArgOperand(1)) return B.CreateBitCast(CI->getArgOperand(0), CI->getType()); // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0 - if (TD && isOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) { - Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, TD, TLI); + if (DL && isOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) { + Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, DL, TLI); if (!StrLen) - return 0; + return nullptr; Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1), - StrLen, B, TD, TLI); + StrLen, B, DL, TLI); if (!StrNCmp) - return 0; - for (Value::use_iterator UI = CI->use_begin(), UE = CI->use_end(); - UI != UE; ) { + return nullptr; + for (auto UI = CI->user_begin(), UE = CI->user_end(); UI != UE;) { ICmpInst *Old = cast<ICmpInst>(*UI++); Value *Cmp = B.CreateICmp(Old->getPredicate(), StrNCmp, ConstantInt::getNullValue(StrNCmp->getType()), @@ -946,20 +984,21 @@ struct StrStrOpt : public LibCallOptimization { // fold strstr(x, "y") -> strchr(x, 'y'). if (HasStr2 && ToFindStr.size() == 1) { - Value *StrChr= EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TD, TLI); - return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : 0; + Value *StrChr= EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, DL, TLI); + return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : nullptr; } - return 0; + return nullptr; } }; struct MemCmpOpt : public LibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || !FT->getReturnType()->isIntegerTy(32)) - return 0; + return nullptr; Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1); @@ -968,7 +1007,7 @@ struct MemCmpOpt : public LibCallOptimization { // Make sure we have a constant length. ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2)); - if (!LenC) return 0; + if (!LenC) return nullptr; uint64_t Len = LenC->getZExtValue(); if (Len == 0) // memcmp(s1,s2,0) -> 0 @@ -989,7 +1028,7 @@ struct MemCmpOpt : public LibCallOptimization { getConstantStringInfo(RHS, RHSStr)) { // Make sure we're not reading out-of-bounds memory. if (Len > LHSStr.size() || Len > RHSStr.size()) - return 0; + return nullptr; // Fold the memcmp and normalize the result. This way we get consistent // results across multiple platforms. 
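// For instance (illustrative only): memcmp("ab", "ac", 2) folds to the
// constant -1 and memcmp("ac", "ab", 2) to 1; normalizing to -1/0/1 hides
// whatever magnitude the host memcmp would have produced, so the folded
// result is identical on every platform.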
uint64_t Ret = 0; @@ -1001,21 +1040,22 @@ struct MemCmpOpt : public LibCallOptimization { return ConstantInt::get(CI->getType(), Ret); } - return 0; + return nullptr; } }; struct MemCpyOpt : public LibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { // These optimizations require DataLayout. - if (!TD) return 0; + if (!DL) return nullptr; FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || - FT->getParamType(2) != TD->getIntPtrType(*Context)) - return 0; + FT->getParamType(2) != DL->getIntPtrType(*Context)) + return nullptr; // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1) B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), @@ -1025,16 +1065,17 @@ struct MemCpyOpt : public LibCallOptimization { }; struct MemMoveOpt : public LibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { // These optimizations require DataLayout. - if (!TD) return 0; + if (!DL) return nullptr; FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || - FT->getParamType(2) != TD->getIntPtrType(*Context)) - return 0; + FT->getParamType(2) != DL->getIntPtrType(*Context)) + return nullptr; // memmove(x, y, n) -> llvm.memmove(x, y, n, 1) B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1), @@ -1044,16 +1085,17 @@ struct MemMoveOpt : public LibCallOptimization { }; struct MemSetOpt : public LibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { // These optimizations require DataLayout. - if (!TD) return 0; + if (!DL) return nullptr; FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isIntegerTy() || - FT->getParamType(2) != TD->getIntPtrType(FT->getParamType(0))) - return 0; + FT->getParamType(2) != DL->getIntPtrType(FT->getParamType(0))) + return nullptr; // memset(p, v, n) -> llvm.memset(p, v, n, 1) Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); @@ -1072,26 +1114,26 @@ struct MemSetOpt : public LibCallOptimization { struct UnaryDoubleFPOpt : public LibCallOptimization { bool CheckRetType; UnaryDoubleFPOpt(bool CheckReturnType): CheckRetType(CheckReturnType) {} - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() || !FT->getParamType(0)->isDoubleTy()) - return 0; + return nullptr; if (CheckRetType) { // Check if all the uses for function like 'sin' are converted to float. 
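// Concretely (illustrative only): for a float f, a call such as
//   float r = (float) sin((double) f);
// may be shrunk to sinf(f) only when every user of the call is an fptrunc
// back to float, which is what the loop below verifies.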
- for (Value::use_iterator UseI = CI->use_begin(); UseI != CI->use_end(); - ++UseI) { - FPTruncInst *Cast = dyn_cast<FPTruncInst>(*UseI); - if (Cast == 0 || !Cast->getType()->isFloatTy()) - return 0; + for (User *U : CI->users()) { + FPTruncInst *Cast = dyn_cast<FPTruncInst>(U); + if (!Cast || !Cast->getType()->isFloatTy()) + return nullptr; } } // If this is something like 'floor((double)floatval)', convert to floorf. FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0)); - if (Cast == 0 || !Cast->getOperand(0)->getType()->isFloatTy()) - return 0; + if (!Cast || !Cast->getOperand(0)->getType()->isFloatTy()) + return nullptr; // floor((double)floatval) -> (double)floorf(floatval) Value *V = Cast->getOperand(0); @@ -1100,6 +1142,49 @@ struct UnaryDoubleFPOpt : public LibCallOptimization { } }; +// Double -> Float Shrinking Optimizations for Binary Functions like 'fmin/fmax' +struct BinaryDoubleFPOpt : public LibCallOptimization { + bool CheckRetType; + BinaryDoubleFPOpt(bool CheckReturnType): CheckRetType(CheckReturnType) {} + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { + FunctionType *FT = Callee->getFunctionType(); + // Just make sure this has 2 arguments of the same FP type, which match the + // result type. + if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + !FT->getParamType(0)->isFloatingPointTy()) + return nullptr; + + if (CheckRetType) { + // Check if all the uses for function like 'fmin/fmax' are converted to + // float. + for (User *U : CI->users()) { + FPTruncInst *Cast = dyn_cast<FPTruncInst>(U); + if (!Cast || !Cast->getType()->isFloatTy()) + return nullptr; + } + } + + // If this is something like 'fmin((double)floatval1, (double)floatval2)', + // we convert it to fminf. 
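// In source terms (illustrative only): with float a and b,
//   double r = fmin((double) a, (double) b);
// becomes
//   double r = (double) fminf(a, b);
// provided both operands are fpext from float, as the casts below check.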
+ FPExtInst *Cast1 = dyn_cast<FPExtInst>(CI->getArgOperand(0)); + FPExtInst *Cast2 = dyn_cast<FPExtInst>(CI->getArgOperand(1)); + if (!Cast1 || !Cast1->getOperand(0)->getType()->isFloatTy() || + !Cast2 || !Cast2->getOperand(0)->getType()->isFloatTy()) + return nullptr; + + // fmin((double)floatval1, (double)floatval2) + // -> (double)fmin(floatval1, floatval2) + Value *V = nullptr; + Value *V1 = Cast1->getOperand(0); + Value *V2 = Cast2->getOperand(0); + V = EmitBinaryFloatFnCall(V1, V2, Callee->getName(), B, + Callee->getAttributes()); + return B.CreateFPExt(V, B.getDoubleTy()); + } +}; + struct UnsafeFPLibCallOptimization : public LibCallOptimization { bool UnsafeFPShrink; UnsafeFPLibCallOptimization(bool UnsafeFPShrink) { @@ -1109,8 +1194,9 @@ struct UnsafeFPLibCallOptimization : public LibCallOptimization { struct CosOpt : public UnsafeFPLibCallOptimization { CosOpt(bool UnsafeFPShrink) : UnsafeFPLibCallOptimization(UnsafeFPShrink) {} - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - Value *Ret = NULL; + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { + Value *Ret = nullptr; if (UnsafeFPShrink && Callee->getName() == "cos" && TLI->has(LibFunc::cosf)) { UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true); @@ -1136,8 +1222,9 @@ struct CosOpt : public UnsafeFPLibCallOptimization { struct PowOpt : public UnsafeFPLibCallOptimization { PowOpt(bool UnsafeFPShrink) : UnsafeFPLibCallOptimization(UnsafeFPShrink) {} - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - Value *Ret = NULL; + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { + Value *Ret = nullptr; if (UnsafeFPShrink && Callee->getName() == "pow" && TLI->has(LibFunc::powf)) { UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true); @@ -1162,10 +1249,16 @@ struct PowOpt : public UnsafeFPLibCallOptimization { hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp2, LibFunc::exp2f, LibFunc::exp2l)) return EmitUnaryFloatFnCall(Op2, "exp2", B, Callee->getAttributes()); + // pow(10.0, x) -> exp10(x) + if (Op1C->isExactlyValue(10.0) && + hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp10, LibFunc::exp10f, + LibFunc::exp10l)) + return EmitUnaryFloatFnCall(Op2, TLI->getName(LibFunc::exp10), B, + Callee->getAttributes()); } ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2); - if (Op2C == 0) return Ret; + if (!Op2C) return Ret; if (Op2C->getValueAPF().isZero()) // pow(x, 0.0) -> 1.0 return ConstantFP::get(CI->getType(), 1.0); @@ -1198,14 +1291,15 @@ struct PowOpt : public UnsafeFPLibCallOptimization { if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Op1, "powrecip"); - return 0; + return nullptr; } }; struct Exp2Opt : public UnsafeFPLibCallOptimization { Exp2Opt(bool UnsafeFPShrink) : UnsafeFPLibCallOptimization(UnsafeFPShrink) {} - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - Value *Ret = NULL; + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { + Value *Ret = nullptr; if (UnsafeFPShrink && Callee->getName() == "exp2" && TLI->has(LibFunc::exp2f)) { UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true); @@ -1222,37 +1316,37 @@ struct Exp2Opt : public UnsafeFPLibCallOptimization { Value *Op = CI->getArgOperand(0); // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32 // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < 32 - Value *LdExpArg = 0; - if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) { - if 
(OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32) - LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty()); - } else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) { - if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32) - LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty()); - } + LibFunc::Func LdExp = LibFunc::ldexpl; + if (Op->getType()->isFloatTy()) + LdExp = LibFunc::ldexpf; + else if (Op->getType()->isDoubleTy()) + LdExp = LibFunc::ldexp; + + if (TLI->has(LdExp)) { + Value *LdExpArg = nullptr; + if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) { + if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32) + LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty()); + } else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) { + if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32) + LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty()); + } - if (LdExpArg) { - const char *Name; - if (Op->getType()->isFloatTy()) - Name = "ldexpf"; - else if (Op->getType()->isDoubleTy()) - Name = "ldexp"; - else - Name = "ldexpl"; - - Constant *One = ConstantFP::get(*Context, APFloat(1.0f)); - if (!Op->getType()->isFloatTy()) - One = ConstantExpr::getFPExtend(One, Op->getType()); - - Module *M = Caller->getParent(); - Value *Callee = M->getOrInsertFunction(Name, Op->getType(), - Op->getType(), - B.getInt32Ty(), NULL); - CallInst *CI = B.CreateCall2(Callee, One, LdExpArg); - if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts())) - CI->setCallingConv(F->getCallingConv()); + if (LdExpArg) { + Constant *One = ConstantFP::get(*Context, APFloat(1.0f)); + if (!Op->getType()->isFloatTy()) + One = ConstantExpr::getFPExtend(One, Op->getType()); - return CI; + Module *M = Caller->getParent(); + Value *Callee = + M->getOrInsertFunction(TLI->getName(LdExp), Op->getType(), + Op->getType(), B.getInt32Ty(), NULL); + CallInst *CI = B.CreateCall2(Callee, One, LdExpArg); + if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts())) + CI->setCallingConv(F->getCallingConv()); + + return CI; + } } return Ret; } @@ -1261,11 +1355,12 @@ struct Exp2Opt : public UnsafeFPLibCallOptimization { struct SinCosPiOpt : public LibCallOptimization { SinCosPiOpt() {} - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { // Make sure the prototype is as expected, otherwise the rest of the // function is probably invalid and likely to abort. if (!isTrigLibCall(CI)) - return 0; + return nullptr; Value *Arg = CI->getArgOperand(0); SmallVector<CallInst *, 1> SinCalls; @@ -1277,14 +1372,13 @@ struct SinCosPiOpt : public LibCallOptimization { // Look for all compatible sinpi, cospi and sincospi calls with the same // argument. If there are enough (in some sense) we can make the // substitution. - for (Value::use_iterator UI = Arg->use_begin(), UE = Arg->use_end(); - UI != UE; ++UI) - classifyArgUse(*UI, CI->getParent(), IsFloat, SinCalls, CosCalls, + for (User *U : Arg->users()) + classifyArgUse(U, CI->getParent(), IsFloat, SinCalls, CosCalls, SinCosCalls); // It's only worthwhile if both sinpi and cospi are actually used. 
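// Sketch of the payoff (illustrative, not diff lines): when the same x feeds
//   float s = sinpif(x);   // hypothetical caller code
//   float c = cospif(x);
// both calls are replaced by one combined call (the float path uses the
// __sincospif_stret helper named below) and s and c are extracted from its
// aggregate result, halving the number of library calls.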
if (SinCosCalls.empty() && (SinCalls.empty() || CosCalls.empty())) - return 0; + return nullptr; Value *Sin, *Cos, *SinCos; insertSinCosCall(B, CI->getCalledFunction(), Arg, IsFloat, Sin, Cos, @@ -1294,7 +1388,7 @@ struct SinCosPiOpt : public LibCallOptimization { replaceTrigInsts(CosCalls, Cos); replaceTrigInsts(SinCosCalls, SinCos); - return 0; + return nullptr; } bool isTrigLibCall(CallInst *CI) { @@ -1334,7 +1428,7 @@ struct SinCosPiOpt : public LibCallOptimization { SinCalls.push_back(CI); else if (Func == LibFunc::cospif) CosCalls.push_back(CI); - else if (Func == LibFunc::sincospi_stretf) + else if (Func == LibFunc::sincospif_stret) SinCosCalls.push_back(CI); } else { if (Func == LibFunc::sinpi) @@ -1363,7 +1457,7 @@ struct SinCosPiOpt : public LibCallOptimization { Triple T(OrigCallee->getParent()->getTargetTriple()); if (UseFloat) { - Name = "__sincospi_stretf"; + Name = "__sincospif_stret"; assert(T.getArch() != Triple::x86 && "x86 messy and unsupported for now"); // x86_64 can't use {float, float} since that would be returned in both @@ -1412,14 +1506,15 @@ struct SinCosPiOpt : public LibCallOptimization { //===----------------------------------------------------------------------===// struct FFSOpt : public LibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { FunctionType *FT = Callee->getFunctionType(); // Just make sure this has 2 arguments of the same FP type, which match the // result type. if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy(32) || !FT->getParamType(0)->isIntegerTy()) - return 0; + return nullptr; Value *Op = CI->getArgOperand(0); @@ -1445,13 +1540,14 @@ struct FFSOpt : public LibCallOptimization { }; struct AbsOpt : public LibCallOptimization { - virtual bool ignoreCallingConv() { return true; } - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + bool ignoreCallingConv() override { return true; } + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { FunctionType *FT = Callee->getFunctionType(); // We require integer(integer) where the types agree. if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || FT->getParamType(0) != FT->getReturnType()) - return 0; + return nullptr; // abs(x) -> x >s -1 ? 
x : -x Value *Op = CI->getArgOperand(0); @@ -1463,12 +1559,13 @@ struct AbsOpt : public LibCallOptimization { }; struct IsDigitOpt : public LibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { FunctionType *FT = Callee->getFunctionType(); // We require integer(i32) if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || !FT->getParamType(0)->isIntegerTy(32)) - return 0; + return nullptr; // isdigit(c) -> (c-'0') <u 10 Value *Op = CI->getArgOperand(0); @@ -1479,12 +1576,13 @@ struct IsDigitOpt : public LibCallOptimization { }; struct IsAsciiOpt : public LibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { FunctionType *FT = Callee->getFunctionType(); // We require integer(i32) if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || !FT->getParamType(0)->isIntegerTy(32)) - return 0; + return nullptr; // isascii(c) -> c <u 128 Value *Op = CI->getArgOperand(0); @@ -1494,12 +1592,13 @@ struct IsAsciiOpt : public LibCallOptimization { }; struct ToAsciiOpt : public LibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { FunctionType *FT = Callee->getFunctionType(); // We require i32(i32) if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isIntegerTy(32)) - return 0; + return nullptr; // toascii(c) -> c & 0x7f return B.CreateAnd(CI->getArgOperand(0), @@ -1514,7 +1613,8 @@ struct ToAsciiOpt : public LibCallOptimization { struct ErrorReportingOpt : public LibCallOptimization { ErrorReportingOpt(int S = -1) : StreamArg(S) {} - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &) override { // Error reporting calls should be cold, mark them as such. // This applies even to non-builtin calls: it is only a hint and applies to // functions that the frontend might not understand as builtins. @@ -1528,7 +1628,7 @@ struct ErrorReportingOpt : public LibCallOptimization { CI->addAttribute(AttributeSet::FunctionIndex, Attribute::Cold); } - return 0; + return nullptr; } protected: @@ -1565,7 +1665,7 @@ struct PrintFOpt : public LibCallOptimization { // Check for a fixed format string. StringRef FormatStr; if (!getConstantStringInfo(CI->getArgOperand(0), FormatStr)) - return 0; + return nullptr; // Empty format string -> noop. if (FormatStr.empty()) // Tolerate printf's declared void. @@ -1576,11 +1676,11 @@ struct PrintFOpt : public LibCallOptimization { // is used, in general the printf return value is not compatible with either // putchar() or puts(). if (!CI->use_empty()) - return 0; + return nullptr; // printf("x") -> putchar('x'), even for '%'. if (FormatStr.size() == 1) { - Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, TD, TLI); + Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, DL, TLI); if (CI->use_empty() || !Res) return Res; return B.CreateIntCast(Res, CI->getType(), true); } @@ -1592,7 +1692,7 @@ struct PrintFOpt : public LibCallOptimization { // pass to be run after this pass, to merge duplicate strings. 
FormatStr = FormatStr.drop_back(); Value *GV = B.CreateGlobalString(FormatStr, "str"); - Value *NewCI = EmitPutS(GV, B, TD, TLI); + Value *NewCI = EmitPutS(GV, B, DL, TLI); return (CI->use_empty() || !NewCI) ? NewCI : ConstantInt::get(CI->getType(), FormatStr.size()+1); @@ -1602,7 +1702,7 @@ struct PrintFOpt : public LibCallOptimization { // printf("%c", chr) --> putchar(chr) if (FormatStr == "%c" && CI->getNumArgOperands() > 1 && CI->getArgOperand(1)->getType()->isIntegerTy()) { - Value *Res = EmitPutChar(CI->getArgOperand(1), B, TD, TLI); + Value *Res = EmitPutChar(CI->getArgOperand(1), B, DL, TLI); if (CI->use_empty() || !Res) return Res; return B.CreateIntCast(Res, CI->getType(), true); @@ -1611,18 +1711,19 @@ struct PrintFOpt : public LibCallOptimization { // printf("%s\n", str) --> puts(str) if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 && CI->getArgOperand(1)->getType()->isPointerTy()) { - return EmitPutS(CI->getArgOperand(1), B, TD, TLI); + return EmitPutS(CI->getArgOperand(1), B, DL, TLI); } - return 0; + return nullptr; } - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { // Require one fixed pointer argument and an integer/void result. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() || !(FT->getReturnType()->isIntegerTy() || FT->getReturnType()->isVoidTy())) - return 0; + return nullptr; if (Value *V = optimizeFixedFormatString(Callee, CI, B)) { return V; @@ -1639,7 +1740,7 @@ struct PrintFOpt : public LibCallOptimization { B.Insert(New); return New; } - return 0; + return nullptr; } }; @@ -1649,7 +1750,7 @@ struct SPrintFOpt : public LibCallOptimization { // Check for a fixed format string. StringRef FormatStr; if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr)) - return 0; + return nullptr; // If we just have a format string (nothing else crazy) transform it. if (CI->getNumArgOperands() == 2) { @@ -1657,14 +1758,14 @@ struct SPrintFOpt : public LibCallOptimization { // %% -> % in the future if we cared. for (unsigned i = 0, e = FormatStr.size(); i != e; ++i) if (FormatStr[i] == '%') - return 0; // we found a format specifier, bail out. + return nullptr; // we found a format specifier, bail out. // These optimizations require DataLayout. - if (!TD) return 0; + if (!DL) return nullptr; // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1) B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), - ConstantInt::get(TD->getIntPtrType(*Context), // Copy the + ConstantInt::get(DL->getIntPtrType(*Context), // Copy the FormatStr.size() + 1), 1); // nul byte. return ConstantInt::get(CI->getType(), FormatStr.size()); } @@ -1673,12 +1774,12 @@ struct SPrintFOpt : public LibCallOptimization { // and have an extra operand. if (FormatStr.size() != 2 || FormatStr[0] != '%' || CI->getNumArgOperands() < 3) - return 0; + return nullptr; // Decode the second character of the format string. 
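// The two cases handled below (illustrative only):
//   sprintf(dst, "%c", chr) -> store chr and a trailing nul; result is 1
//   sprintf(dst, "%s", str) -> memcpy(dst, str, strlen(str) + 1);
//                              result is strlen(str); needs DataLayout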
if (FormatStr[1] == 'c') { // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0 - if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0; + if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return nullptr; Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char"); Value *Ptr = CastToCStr(CI->getArgOperand(0), B); B.CreateStore(V, Ptr); @@ -1690,14 +1791,14 @@ struct SPrintFOpt : public LibCallOptimization { if (FormatStr[1] == 's') { // These optimizations require DataLayout. - if (!TD) return 0; + if (!DL) return nullptr; // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1) - if (!CI->getArgOperand(2)->getType()->isPointerTy()) return 0; + if (!CI->getArgOperand(2)->getType()->isPointerTy()) return nullptr; - Value *Len = EmitStrLen(CI->getArgOperand(2), B, TD, TLI); + Value *Len = EmitStrLen(CI->getArgOperand(2), B, DL, TLI); if (!Len) - return 0; + return nullptr; Value *IncLen = B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1), "leninc"); @@ -1706,16 +1807,17 @@ struct SPrintFOpt : public LibCallOptimization { // The sprintf result is the unincremented number of bytes in the string. return B.CreateIntCast(Len, CI->getType(), false); } - return 0; + return nullptr; } - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { // Require two fixed pointer arguments and an integer result. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || !FT->getReturnType()->isIntegerTy()) - return 0; + return nullptr; if (Value *V = OptimizeFixedFormatString(Callee, CI, B)) { return V; @@ -1732,7 +1834,7 @@ struct SPrintFOpt : public LibCallOptimization { B.Insert(New); return New; } - return 0; + return nullptr; } }; @@ -1745,58 +1847,59 @@ struct FPrintFOpt : public LibCallOptimization { // All the optimizations depend on the format string. StringRef FormatStr; if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr)) - return 0; + return nullptr; // Do not do any of the following transformations if the fprintf return // value is used, in general the fprintf return value is not compatible // with fwrite(), fputc() or fputs(). if (!CI->use_empty()) - return 0; + return nullptr; // fprintf(F, "foo") --> fwrite("foo", 3, 1, F) if (CI->getNumArgOperands() == 2) { for (unsigned i = 0, e = FormatStr.size(); i != e; ++i) if (FormatStr[i] == '%') // Could handle %% -> % if we cared. - return 0; // We found a format specifier. + return nullptr; // We found a format specifier. // These optimizations require DataLayout. - if (!TD) return 0; + if (!DL) return nullptr; return EmitFWrite(CI->getArgOperand(1), - ConstantInt::get(TD->getIntPtrType(*Context), + ConstantInt::get(DL->getIntPtrType(*Context), FormatStr.size()), - CI->getArgOperand(0), B, TD, TLI); + CI->getArgOperand(0), B, DL, TLI); } // The remaining optimizations require the format string to be "%s" or "%c" // and have an extra operand. if (FormatStr.size() != 2 || FormatStr[0] != '%' || CI->getNumArgOperands() < 3) - return 0; + return nullptr; // Decode the second character of the format string. 
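// The fprintf analogues (illustrative only):
//   fprintf(F, "%c", chr) -> fputc(chr, F)
//   fprintf(F, "%s", str) -> fputs(str, F)
// both applied only when the fprintf result is unused, per the guard above.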
if (FormatStr[1] == 'c') { // fprintf(F, "%c", chr) --> fputc(chr, F) - if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0; - return EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TD, TLI); + if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return nullptr; + return EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, DL, TLI); } if (FormatStr[1] == 's') { // fprintf(F, "%s", str) --> fputs(str, F) if (!CI->getArgOperand(2)->getType()->isPointerTy()) - return 0; - return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TD, TLI); + return nullptr; + return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, DL, TLI); } - return 0; + return nullptr; } - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { // Require two fixed paramters as pointers and integer result. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || !FT->getReturnType()->isIntegerTy()) - return 0; + return nullptr; if (Value *V = optimizeFixedFormatString(Callee, CI, B)) { return V; @@ -1813,12 +1916,13 @@ struct FPrintFOpt : public LibCallOptimization { B.Insert(New); return New; } - return 0; + return nullptr; } }; struct FWriteOpt : public LibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { ErrorReportingOpt ER(/* StreamArg = */ 3); (void) ER.callOptimizer(Callee, CI, B); @@ -1829,12 +1933,12 @@ struct FWriteOpt : public LibCallOptimization { !FT->getParamType(2)->isIntegerTy() || !FT->getParamType(3)->isPointerTy() || !FT->getReturnType()->isIntegerTy()) - return 0; + return nullptr; // Get the element size and count. ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2)); - if (!SizeC || !CountC) return 0; + if (!SizeC || !CountC) return nullptr; uint64_t Bytes = SizeC->getZExtValue()*CountC->getZExtValue(); // If this is writing zero records, remove the call (it's a noop). @@ -1845,61 +1949,63 @@ struct FWriteOpt : public LibCallOptimization { // This optimisation is only valid, if the return value is unused. if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F) Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char"); - Value *NewCI = EmitFPutC(Char, CI->getArgOperand(3), B, TD, TLI); - return NewCI ? ConstantInt::get(CI->getType(), 1) : 0; + Value *NewCI = EmitFPutC(Char, CI->getArgOperand(3), B, DL, TLI); + return NewCI ? ConstantInt::get(CI->getType(), 1) : nullptr; } - return 0; + return nullptr; } }; struct FPutsOpt : public LibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { ErrorReportingOpt ER(/* StreamArg = */ 1); (void) ER.callOptimizer(Callee, CI, B); // These optimizations require DataLayout. - if (!TD) return 0; + if (!DL) return nullptr; // Require two pointers. Also, we can't optimize if return value is used. 
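// E.g. (illustrative only): fputs("hello", F) has a known string length of
// 6 including the nul, so the fold below emits fwrite("hello", 1, 5, F);
// like the other length-based folds, this requires DataLayout.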
FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || !CI->use_empty()) - return 0; + return nullptr; // fputs(s,F) --> fwrite(s,1,strlen(s),F) uint64_t Len = GetStringLength(CI->getArgOperand(0)); - if (!Len) return 0; + if (!Len) return nullptr; // Known to have no uses (see above). return EmitFWrite(CI->getArgOperand(0), - ConstantInt::get(TD->getIntPtrType(*Context), Len-1), - CI->getArgOperand(1), B, TD, TLI); + ConstantInt::get(DL->getIntPtrType(*Context), Len-1), + CI->getArgOperand(1), B, DL, TLI); } }; struct PutsOpt : public LibCallOptimization { - virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *callOptimizer(Function *Callee, CallInst *CI, + IRBuilder<> &B) override { // Require one fixed pointer argument and an integer/void result. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() || !(FT->getReturnType()->isIntegerTy() || FT->getReturnType()->isVoidTy())) - return 0; + return nullptr; // Check for a constant string. StringRef Str; if (!getConstantStringInfo(CI->getArgOperand(0), Str)) - return 0; + return nullptr; if (Str.empty() && CI->use_empty()) { // puts("") -> putchar('\n') - Value *Res = EmitPutChar(B.getInt32('\n'), B, TD, TLI); + Value *Res = EmitPutChar(B.getInt32('\n'), B, DL, TLI); if (CI->use_empty() || !Res) return Res; return B.CreateIntCast(Res, CI->getType(), true); } - return 0; + return nullptr; } }; @@ -1908,7 +2014,7 @@ struct PutsOpt : public LibCallOptimization { namespace llvm { class LibCallSimplifierImpl { - const DataLayout *TD; + const DataLayout *DL; const TargetLibraryInfo *TLI; const LibCallSimplifier *LCS; bool UnsafeFPShrink; @@ -1918,11 +2024,11 @@ class LibCallSimplifierImpl { PowOpt Pow; Exp2Opt Exp2; public: - LibCallSimplifierImpl(const DataLayout *TD, const TargetLibraryInfo *TLI, + LibCallSimplifierImpl(const DataLayout *DL, const TargetLibraryInfo *TLI, const LibCallSimplifier *LCS, bool UnsafeFPShrink = false) : Cos(UnsafeFPShrink), Pow(UnsafeFPShrink), Exp2(UnsafeFPShrink) { - this->TD = TD; + this->DL = DL; this->TLI = TLI; this->LCS = LCS; this->UnsafeFPShrink = UnsafeFPShrink; @@ -1975,6 +2081,7 @@ static MemSetOpt MemSet; // Math library call optimizations. 
static UnaryDoubleFPOpt UnaryDoubleFP(false); +static BinaryDoubleFPOpt BinaryDoubleFP(false); static UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true); static SinCosPiOpt SinCosPi; @@ -2009,7 +2116,7 @@ LibCallOptimization *LibCallSimplifierImpl::lookupOptimization(CallInst *CI) { case Intrinsic::exp2: return &Exp2; default: - return 0; + return nullptr; } } @@ -2119,7 +2226,7 @@ LibCallOptimization *LibCallSimplifierImpl::lookupOptimization(CallInst *CI) { case LibFunc::trunc: if (hasFloatVersion(FuncName)) return &UnaryDoubleFP; - return 0; + return nullptr; case LibFunc::acos: case LibFunc::acosh: case LibFunc::asin: @@ -2143,11 +2250,16 @@ LibCallOptimization *LibCallSimplifierImpl::lookupOptimization(CallInst *CI) { case LibFunc::tanh: if (UnsafeFPShrink && hasFloatVersion(FuncName)) return &UnsafeUnaryDoubleFP; - return 0; + return nullptr; + case LibFunc::fmin: + case LibFunc::fmax: + if (hasFloatVersion(FuncName)) + return &BinaryDoubleFP; + return nullptr; case LibFunc::memcpy_chk: return &MemCpyChk; default: - return 0; + return nullptr; } } @@ -2167,7 +2279,7 @@ LibCallOptimization *LibCallSimplifierImpl::lookupOptimization(CallInst *CI) { return &StrNCpyChk; } - return 0; + return nullptr; } @@ -2175,15 +2287,15 @@ Value *LibCallSimplifierImpl::optimizeCall(CallInst *CI) { LibCallOptimization *LCO = lookupOptimization(CI); if (LCO) { IRBuilder<> Builder(CI); - return LCO->optimizeCall(CI, TD, TLI, LCS, Builder); + return LCO->optimizeCall(CI, DL, TLI, LCS, Builder); } - return 0; + return nullptr; } -LibCallSimplifier::LibCallSimplifier(const DataLayout *TD, +LibCallSimplifier::LibCallSimplifier(const DataLayout *DL, const TargetLibraryInfo *TLI, bool UnsafeFPShrink) { - Impl = new LibCallSimplifierImpl(TD, TLI, this, UnsafeFPShrink); + Impl = new LibCallSimplifierImpl(DL, TLI, this, UnsafeFPShrink); } LibCallSimplifier::~LibCallSimplifier() { @@ -2191,7 +2303,7 @@ LibCallSimplifier::~LibCallSimplifier() { } Value *LibCallSimplifier::optimizeCall(CallInst *CI) { - if (CI->isNoBuiltin()) return 0; + if (CI->isNoBuiltin()) return nullptr; return Impl->optimizeCall(CI); } @@ -2242,8 +2354,6 @@ void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) const { // * sqrt(Nroot(x)) -> pow(x,1/(2*N)) // * sqrt(pow(x,y)) -> pow(|x|,y*0.5) // -// strchr: -// * strchr(p, 0) -> strlen(p) // tan, tanf, tanl: // * tan(atan(x)) -> x // diff --git a/contrib/llvm/lib/Transforms/Utils/SpecialCaseList.cpp b/contrib/llvm/lib/Transforms/Utils/SpecialCaseList.cpp deleted file mode 100644 index 2ef692c..0000000 --- a/contrib/llvm/lib/Transforms/Utils/SpecialCaseList.cpp +++ /dev/null @@ -1,222 +0,0 @@ -//===-- SpecialCaseList.cpp - special case list for sanitizers ------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This is a utility class for instrumentation passes (like AddressSanitizer -// or ThreadSanitizer) to avoid instrumenting some functions or global -// variables, or to instrument some functions or global variables in a specific -// way, based on a user-supplied list. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/Transforms/Utils/SpecialCaseList.h" -#include "llvm/ADT/OwningPtr.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/StringSet.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/Regex.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/system_error.h" -#include <string> -#include <utility> - -namespace llvm { - -/// Represents a set of regular expressions. Regular expressions which are -/// "literal" (i.e. no regex metacharacters) are stored in Strings, while all -/// others are represented as a single pipe-separated regex in RegEx. The -/// reason for doing so is efficiency; StringSet is much faster at matching -/// literal strings than Regex. -struct SpecialCaseList::Entry { - StringSet<> Strings; - Regex *RegEx; - - Entry() : RegEx(0) {} - - bool match(StringRef Query) const { - return Strings.count(Query) || (RegEx && RegEx->match(Query)); - } -}; - -SpecialCaseList::SpecialCaseList() : Entries() {} - -SpecialCaseList *SpecialCaseList::create( - const StringRef Path, std::string &Error) { - if (Path.empty()) - return new SpecialCaseList(); - OwningPtr<MemoryBuffer> File; - if (error_code EC = MemoryBuffer::getFile(Path, File)) { - Error = (Twine("Can't open file '") + Path + "': " + EC.message()).str(); - return 0; - } - return create(File.get(), Error); -} - -SpecialCaseList *SpecialCaseList::create( - const MemoryBuffer *MB, std::string &Error) { - OwningPtr<SpecialCaseList> SCL(new SpecialCaseList()); - if (!SCL->parse(MB, Error)) - return 0; - return SCL.take(); -} - -SpecialCaseList *SpecialCaseList::createOrDie(const StringRef Path) { - std::string Error; - if (SpecialCaseList *SCL = create(Path, Error)) - return SCL; - report_fatal_error(Error); -} - -bool SpecialCaseList::parse(const MemoryBuffer *MB, std::string &Error) { - // Iterate through each line in the blacklist file. - SmallVector<StringRef, 16> Lines; - SplitString(MB->getBuffer(), Lines, "\n\r"); - StringMap<StringMap<std::string> > Regexps; - assert(Entries.empty() && - "parse() should be called on an empty SpecialCaseList"); - int LineNo = 1; - for (SmallVectorImpl<StringRef>::iterator I = Lines.begin(), E = Lines.end(); - I != E; ++I, ++LineNo) { - // Ignore empty lines and lines starting with "#" - if (I->empty() || I->startswith("#")) - continue; - // Get our prefix and unparsed regexp. - std::pair<StringRef, StringRef> SplitLine = I->split(":"); - StringRef Prefix = SplitLine.first; - if (SplitLine.second.empty()) { - // Missing ':' in the line. - Error = (Twine("Malformed line ") + Twine(LineNo) + ": '" + - SplitLine.first + "'").str(); - return false; - } - - std::pair<StringRef, StringRef> SplitRegexp = SplitLine.second.split("="); - std::string Regexp = SplitRegexp.first; - StringRef Category = SplitRegexp.second; - - // Backwards compatibility. - if (Prefix == "global-init") { - Prefix = "global"; - Category = "init"; - } else if (Prefix == "global-init-type") { - Prefix = "type"; - Category = "init"; - } else if (Prefix == "global-init-src") { - Prefix = "src"; - Category = "init"; - } - - // See if we can store Regexp in Strings. 
- if (Regex::isLiteralERE(Regexp)) { - Entries[Prefix][Category].Strings.insert(Regexp); - continue; - } - - // Replace * with .* - for (size_t pos = 0; (pos = Regexp.find("*", pos)) != std::string::npos; - pos += strlen(".*")) { - Regexp.replace(pos, strlen("*"), ".*"); - } - - // Check that the regexp is valid. - Regex CheckRE(Regexp); - std::string REError; - if (!CheckRE.isValid(REError)) { - Error = (Twine("Malformed regex in line ") + Twine(LineNo) + ": '" + - SplitLine.second + "': " + REError).str(); - return false; - } - - // Add this regexp into the proper group by its prefix. - if (!Regexps[Prefix][Category].empty()) - Regexps[Prefix][Category] += "|"; - Regexps[Prefix][Category] += "^" + Regexp + "$"; - } - - // Iterate through each of the prefixes, and create Regexs for them. - for (StringMap<StringMap<std::string> >::const_iterator I = Regexps.begin(), - E = Regexps.end(); - I != E; ++I) { - for (StringMap<std::string>::const_iterator II = I->second.begin(), - IE = I->second.end(); - II != IE; ++II) { - Entries[I->getKey()][II->getKey()].RegEx = new Regex(II->getValue()); - } - } - return true; -} - -SpecialCaseList::~SpecialCaseList() { - for (StringMap<StringMap<Entry> >::iterator I = Entries.begin(), - E = Entries.end(); - I != E; ++I) { - for (StringMap<Entry>::const_iterator II = I->second.begin(), - IE = I->second.end(); - II != IE; ++II) { - delete II->second.RegEx; - } - } -} - -bool SpecialCaseList::isIn(const Function& F, const StringRef Category) const { - return isIn(*F.getParent(), Category) || - inSectionCategory("fun", F.getName(), Category); -} - -static StringRef GetGlobalTypeString(const GlobalValue &G) { - // Types of GlobalVariables are always pointer types. - Type *GType = G.getType()->getElementType(); - // For now we support blacklisting struct types only. 
- if (StructType *SGType = dyn_cast<StructType>(GType)) { - if (!SGType->isLiteral()) - return SGType->getName(); - } - return "<unknown type>"; -} - -bool SpecialCaseList::isIn(const GlobalVariable &G, - const StringRef Category) const { - return isIn(*G.getParent(), Category) || - inSectionCategory("global", G.getName(), Category) || - inSectionCategory("type", GetGlobalTypeString(G), Category); -} - -bool SpecialCaseList::isIn(const GlobalAlias &GA, - const StringRef Category) const { - if (isIn(*GA.getParent(), Category)) - return true; - - if (isa<FunctionType>(GA.getType()->getElementType())) - return inSectionCategory("fun", GA.getName(), Category); - - return inSectionCategory("global", GA.getName(), Category) || - inSectionCategory("type", GetGlobalTypeString(GA), Category); -} - -bool SpecialCaseList::isIn(const Module &M, const StringRef Category) const { - return inSectionCategory("src", M.getModuleIdentifier(), Category); -} - -bool SpecialCaseList::inSectionCategory(const StringRef Section, - const StringRef Query, - const StringRef Category) const { - StringMap<StringMap<Entry> >::const_iterator I = Entries.find(Section); - if (I == Entries.end()) return false; - StringMap<Entry>::const_iterator II = I->second.find(Category); - if (II == I->second.end()) return false; - - return II->getValue().match(Query); -} - -} // namespace llvm diff --git a/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp index 560f581..0c2fc0a 100644 --- a/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp +++ b/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp @@ -59,7 +59,7 @@ bool UnifyFunctionExitNodes::runOnFunction(Function &F) { // Then unreachable blocks. if (UnreachableBlocks.empty()) { - UnreachableBlock = 0; + UnreachableBlock = nullptr; } else if (UnreachableBlocks.size() == 1) { UnreachableBlock = UnreachableBlocks.front(); } else { @@ -77,7 +77,7 @@ bool UnifyFunctionExitNodes::runOnFunction(Function &F) { // Now handle return blocks. if (ReturningBlocks.empty()) { - ReturnBlock = 0; + ReturnBlock = nullptr; return false; // No blocks return } else if (ReturningBlocks.size() == 1) { ReturnBlock = ReturningBlocks.front(); // Already has a single return block @@ -91,9 +91,9 @@ bool UnifyFunctionExitNodes::runOnFunction(Function &F) { BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), "UnifiedReturnBlock", &F); - PHINode *PN = 0; + PHINode *PN = nullptr; if (F.getReturnType()->isVoidTy()) { - ReturnInst::Create(F.getContext(), NULL, NewRetBlock); + ReturnInst::Create(F.getContext(), nullptr, NewRetBlock); } else { // If the function doesn't return void... add a PHI node to the block... PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(), diff --git a/contrib/llvm/lib/Transforms/Utils/Utils.cpp b/contrib/llvm/lib/Transforms/Utils/Utils.cpp index c3df215..ed4f45c 100644 --- a/contrib/llvm/lib/Transforms/Utils/Utils.cpp +++ b/contrib/llvm/lib/Transforms/Utils/Utils.cpp @@ -13,14 +13,15 @@ //===----------------------------------------------------------------------===// #include "llvm/InitializePasses.h" -#include "llvm/PassRegistry.h" #include "llvm-c/Initialization.h" +#include "llvm/PassRegistry.h" using namespace llvm; /// initializeTransformUtils - Initialize all passes in the TransformUtils /// library. 
diff --git a/contrib/llvm/lib/Transforms/Utils/Utils.cpp b/contrib/llvm/lib/Transforms/Utils/Utils.cpp
index c3df215..ed4f45c 100644
--- a/contrib/llvm/lib/Transforms/Utils/Utils.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/Utils.cpp
@@ -13,14 +13,15 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/InitializePasses.h"
-#include "llvm/PassRegistry.h"
 #include "llvm-c/Initialization.h"
+#include "llvm/PassRegistry.h"
 
 using namespace llvm;
 
 /// initializeTransformUtils - Initialize all passes in the TransformUtils
 /// library.
 void llvm::initializeTransformUtils(PassRegistry &Registry) {
+  initializeAddDiscriminatorsPass(Registry);
   initializeBreakCriticalEdgesPass(Registry);
   initializeInstNamerPass(Registry);
   initializeLCSSAPass(Registry);
diff --git a/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp b/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp
index 457fc80..0f20e6d 100644
--- a/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp
@@ -71,12 +71,12 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
     // Check all operands to see if any need to be remapped.
     for (unsigned i = 0, e = MD->getNumOperands(); i != e; ++i) {
       Value *OP = MD->getOperand(i);
-      if (OP == 0) continue;
+      if (!OP) continue;
       Value *Mapped_OP = MapValue(OP, VM, Flags, TypeMapper, Materializer);
       // Use identity map if Mapped_Op is null and we can ignore missing
       // entries.
       if (Mapped_OP == OP ||
-          (Mapped_OP == 0 && (Flags & RF_IgnoreMissingEntries)))
+          (Mapped_OP == nullptr && (Flags & RF_IgnoreMissingEntries)))
         continue;
 
       // Ok, at least one operand needs remapping.
@@ -84,13 +84,13 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
       Elts.reserve(MD->getNumOperands());
       for (i = 0; i != e; ++i) {
         Value *Op = MD->getOperand(i);
-        if (Op == 0)
-          Elts.push_back(0);
+        if (!Op)
+          Elts.push_back(nullptr);
         else {
           Value *Mapped_Op = MapValue(Op, VM, Flags, TypeMapper, Materializer);
           // Use identity map if Mapped_Op is null and we can ignore missing
           // entries.
-          if (Mapped_Op == 0 && (Flags & RF_IgnoreMissingEntries))
+          if (Mapped_Op == nullptr && (Flags & RF_IgnoreMissingEntries))
             Mapped_Op = Op;
           Elts.push_back(Mapped_Op);
         }
@@ -112,8 +112,8 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
   // Okay, this either must be a constant (which may or may not be mappable) or
   // is something that is not in the mapping table.
   Constant *C = const_cast<Constant*>(dyn_cast<Constant>(V));
-  if (C == 0)
-    return 0;
+  if (!C)
+    return nullptr;
 
   if (BlockAddress *BA = dyn_cast<BlockAddress>(C)) {
     Function *F =
@@ -126,7 +126,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
   // Otherwise, we have some other constant to remap. Start by checking to see
   // if all operands have an identity remapping.
   unsigned OpNo = 0, NumOperands = C->getNumOperands();
-  Value *Mapped = 0;
+  Value *Mapped = nullptr;
   for (; OpNo != NumOperands; ++OpNo) {
     Value *Op = C->getOperand(OpNo);
     Mapped = MapValue(Op, VM, Flags, TypeMapper, Materializer);
@@ -187,7 +187,7 @@ void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap,
   for (User::op_iterator op = I->op_begin(), E = I->op_end(); op != E; ++op) {
     Value *V = MapValue(*op, VMap, Flags, TypeMapper, Materializer);
     // If we aren't ignoring missing entries, assert that something happened.
-    if (V != 0)
+    if (V)
       *op = V;
     else
       assert((Flags & RF_IgnoreMissingEntries) &&
@@ -199,7 +199,7 @@ void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap,
     for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
       Value *V = MapValue(PN->getIncomingBlock(i), VMap, Flags);
       // If we aren't ignoring missing entries, assert that something happened.
-      if (V != 0)
+      if (V)
         PN->setIncomingBlock(i, cast<BasicBlock>(V));
       else
         assert((Flags & RF_IgnoreMissingEntries) &&
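The MapValue/RemapInstruction hunks above are a pure 0-to-nullptr cleanup; the behavior is unchanged. For orientation, typical use of this API during cloning looks roughly like the following (OldArg/NewArg/OldI are assumed placeholders, not names from the diff):

ValueToValueMapTy VMap;
VMap[OldArg] = NewArg;             // seed the map with already-remapped values
Instruction *NewI = OldI->clone();
NewI->setName(OldI->getName());
// Rewrite NewI's operands through the map; with RF_IgnoreMissingEntries,
// operands absent from VMap are left untouched instead of asserting.
RemapInstruction(NewI, VMap, RF_IgnoreMissingEntries);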
diff --git a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
index c5e1dcb..28ec83b 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -15,7 +15,6 @@
 //===----------------------------------------------------------------------===//
 
 #define BBV_NAME "bb-vectorize"
-#define DEBUG_TYPE BBV_NAME
 #include "llvm/Transforms/Vectorize.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
@@ -26,7 +25,6 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -34,6 +32,7 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -41,15 +40,17 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Type.h"
+#include "llvm/IR/ValueHandle.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/ValueHandle.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <algorithm>
 using namespace llvm;
 
+#define DEBUG_TYPE BBV_NAME
+
 static cl::opt<bool>
 IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false),
   cl::Hidden, cl::desc("Ignore target information"));
@@ -122,6 +123,10 @@ NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden,
   cl::desc("Don't try to vectorize floating-point math intrinsics"));
 
 static cl::opt<bool>
+  NoBitManipulation("bb-vectorize-no-bitmanip", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to vectorize BitManipulation intrinsics"));
+
+static cl::opt<bool>
 NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden,
   cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"));
 
@@ -199,10 +204,11 @@ namespace {
     BBVectorize(Pass *P, const VectorizeConfig &C)
       : BasicBlockPass(ID), Config(C) {
       AA = &P->getAnalysis<AliasAnalysis>();
-      DT = &P->getAnalysis<DominatorTree>();
+      DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree();
      SE = &P->getAnalysis<ScalarEvolution>();
-      TD = P->getAnalysisIfAvailable<DataLayout>();
-      TTI = IgnoreTargetInfo ? 0 : &P->getAnalysis<TargetTransformInfo>();
+      DataLayoutPass *DLP = P->getAnalysisIfAvailable<DataLayoutPass>();
+      DL = DLP ? &DLP->getDataLayout() : nullptr;
+      TTI = IgnoreTargetInfo ? nullptr : &P->getAnalysis<TargetTransformInfo>();
     }
 
     typedef std::pair<Value *, Value *> ValuePair;
@@ -214,7 +220,7 @@ namespace {
     AliasAnalysis *AA;
     DominatorTree *DT;
     ScalarEvolution *SE;
-    DataLayout *TD;
+    const DataLayout *DL;
     const TargetTransformInfo *TTI;
 
     // FIXME: const correct?
@@ -278,7 +284,7 @@ namespace {
     bool trackUsesOfI(DenseSet<Value *> &Users,
                       AliasSetTracker &WriteSet, Instruction *I,
                       Instruction *J, bool UpdateUsers = true,
-                      DenseSet<ValuePair> *LoadMoveSetPairs = 0);
+                      DenseSet<ValuePair> *LoadMoveSetPairs = nullptr);
 
     void computePairsConnectedTo(
                       DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
@@ -291,8 +297,8 @@ namespace {
     bool pairsConflict(ValuePair P, ValuePair Q,
                        DenseSet<ValuePair> &PairableInstUsers,
                        DenseMap<ValuePair, std::vector<ValuePair> >
-                         *PairableInstUserMap = 0,
-                       DenseSet<VPPair> *PairableInstUserPairSet = 0);
+                         *PairableInstUserMap = nullptr,
+                       DenseSet<VPPair> *PairableInstUserPairSet = nullptr);
 
     bool pairWillFormCycle(ValuePair P,
                        DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers,
@@ -388,6 +394,8 @@ namespace {
     void combineMetadata(Instruction *K, const Instruction *J);
 
     bool vectorizeBB(BasicBlock &BB) {
+      if (skipOptnoneFunction(BB))
+        return false;
       if (!DT->isReachableFromEntry(&BB)) {
         DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() <<
               " in " << BB.getParent()->getName() << "\n");
@@ -428,24 +436,27 @@ namespace {
       return changed;
     }
 
-    virtual bool runOnBasicBlock(BasicBlock &BB) {
+    bool runOnBasicBlock(BasicBlock &BB) override {
+      // OptimizeNone check deferred to vectorizeBB().
+
       AA = &getAnalysis<AliasAnalysis>();
-      DT = &getAnalysis<DominatorTree>();
+      DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
       SE = &getAnalysis<ScalarEvolution>();
-      TD = getAnalysisIfAvailable<DataLayout>();
-      TTI = IgnoreTargetInfo ? 0 : &getAnalysis<TargetTransformInfo>();
+      DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+      DL = DLP ? &DLP->getDataLayout() : nullptr;
+      TTI = IgnoreTargetInfo ? nullptr : &getAnalysis<TargetTransformInfo>();
 
       return vectorizeBB(BB);
     }
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       BasicBlockPass::getAnalysisUsage(AU);
       AU.addRequired<AliasAnalysis>();
-      AU.addRequired<DominatorTree>();
+      AU.addRequired<DominatorTreeWrapperPass>();
       AU.addRequired<ScalarEvolution>();
       AU.addRequired<TargetTransformInfo>();
       AU.addPreserved<AliasAnalysis>();
-      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
       AU.addPreserved<ScalarEvolution>();
       AU.setPreservesCFG();
     }
@@ -528,7 +539,11 @@ namespace {
 
     // Returns the cost of the provided instruction using TTI.
     // This does not handle loads and stores.
-    unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2) {
+    unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2,
+                          TargetTransformInfo::OperandValueKind Op1VK =
+                              TargetTransformInfo::OK_AnyValue,
+                          TargetTransformInfo::OperandValueKind Op2VK =
+                              TargetTransformInfo::OK_AnyValue) {
       switch (Opcode) {
       default: break;
       case Instruction::GetElementPtr:
@@ -558,7 +573,7 @@
       case Instruction::And:
       case Instruction::Or:
       case Instruction::Xor:
-        return TTI->getArithmeticInstrCost(Opcode, T1);
+        return TTI->getArithmeticInstrCost(Opcode, T1, Op1VK, Op2VK);
       case Instruction::Select:
       case Instruction::ICmp:
       case Instruction::FCmp:
@@ -626,11 +641,11 @@
           int64_t Offset = IntOff->getSExtValue();
 
           Type *VTy = IPtr->getType()->getPointerElementType();
-          int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy);
+          int64_t VTyTSS = (int64_t) DL->getTypeStoreSize(VTy);
 
           Type *VTy2 = JPtr->getType()->getPointerElementType();
           if (VTy != VTy2 && Offset < 0) {
-            int64_t VTy2TSS = (int64_t) TD->getTypeStoreSize(VTy2);
+            int64_t VTy2TSS = (int64_t) DL->getTypeStoreSize(VTy2);
             OffsetInElmts = Offset/VTy2TSS;
             return (abs64(Offset) % VTy2TSS) == 0;
           }
@@ -664,7 +679,20 @@
         case Intrinsic::exp:
         case Intrinsic::exp2:
         case Intrinsic::pow:
+        case Intrinsic::round:
+        case Intrinsic::copysign:
+        case Intrinsic::ceil:
+        case Intrinsic::nearbyint:
+        case Intrinsic::rint:
+        case Intrinsic::trunc:
+        case Intrinsic::floor:
+        case Intrinsic::fabs:
           return Config.VectorizeMath;
+        case Intrinsic::bswap:
+        case Intrinsic::ctpop:
+        case Intrinsic::ctlz:
+        case Intrinsic::cttz:
+          return Config.VectorizeBitManipulations;
         case Intrinsic::fma:
         case Intrinsic::fmuladd:
           return Config.VectorizeFMA;
@@ -813,7 +841,7 @@
 
       // It is important to cleanup here so that future iterations of this
      // function have less work to do.
-      (void) SimplifyInstructionsInBlock(&BB, TD, AA->getTargetLibraryInfo());
+      (void) SimplifyInstructionsInBlock(&BB, DL, AA->getTargetLibraryInfo());
 
       return true;
     }
@@ -868,7 +896,7 @@ namespace {
      }
 
       // We can't vectorize memory operations without target data
-      if (TD == 0 && IsSimpleLoadStore)
+      if (!DL && IsSimpleLoadStore)
         return false;
 
       Type *T1, *T2;
@@ -905,7 +933,7 @@
       if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy())
         return false;
 
-      if ((!Config.VectorizePointers || TD == 0) &&
+      if ((!Config.VectorizePointers || !DL) &&
           (T1->getScalarType()->isPointerTy() ||
            T2->getScalarType()->isPointerTy()))
         return false;
@@ -969,7 +997,7 @@
           // with the lower offset has an alignment suitable for the
           // vector type.
 
-          unsigned VecAlignment = TD->getPrefTypeAlignment(VType);
+          unsigned VecAlignment = DL->getPrefTypeAlignment(VType);
           if (BottomAlignment < VecAlignment)
             return false;
         }
@@ -1009,13 +1037,49 @@ namespace {
         unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2);
         Type *VT1 = getVecTypeForPair(IT1, JT1),
              *VT2 = getVecTypeForPair(IT2, JT2);
+        TargetTransformInfo::OperandValueKind Op1VK =
+            TargetTransformInfo::OK_AnyValue;
+        TargetTransformInfo::OperandValueKind Op2VK =
+            TargetTransformInfo::OK_AnyValue;
+
+        // On some targets (example X86) the cost of a vector shift may vary
+        // depending on whether the second operand is a Uniform or
+        // NonUniform Constant.
+        switch (I->getOpcode()) {
+        default : break;
+        case Instruction::Shl:
+        case Instruction::LShr:
+        case Instruction::AShr:
+
+          // If both I and J are scalar shifts by constant, then the
+          // merged vector shift count would be either a constant splat value
+          // or a non-uniform vector of constants.
+          if (ConstantInt *CII = dyn_cast<ConstantInt>(I->getOperand(1))) {
+            if (ConstantInt *CIJ = dyn_cast<ConstantInt>(J->getOperand(1)))
+              Op2VK = CII == CIJ ? TargetTransformInfo::OK_UniformConstantValue :
+                                   TargetTransformInfo::OK_NonUniformConstantValue;
+          } else {
+            // Check for a splat of a constant or for a non uniform vector
+            // of constants.
+            Value *IOp = I->getOperand(1);
+            Value *JOp = J->getOperand(1);
+            if ((isa<ConstantVector>(IOp) || isa<ConstantDataVector>(IOp)) &&
+                (isa<ConstantVector>(JOp) || isa<ConstantDataVector>(JOp))) {
+              Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+              Constant *SplatValue = cast<Constant>(IOp)->getSplatValue();
+              if (SplatValue != nullptr &&
+                  SplatValue == cast<Constant>(JOp)->getSplatValue())
+                Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+            }
+          }
+        }
 
         // Note that this procedure is incorrect for insert and extract element
         // instructions (because combining these often results in a shuffle),
         // but this cost is ignored (because insert and extract element
         // instructions are assigned a zero depth factor and are not really
         // fused in general).
-        unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2);
+        unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK);
 
         if (VCost > ICost + JCost)
           return false;
@@ -1033,13 +1097,14 @@ namespace {
           CostSavings = ICost + JCost - VCost;
         }
 
-        // The powi intrinsic is special because only the first argument is
-        // vectorized, the second arguments must be equal.
+        // The powi,ctlz,cttz intrinsics are special because only the first
+        // argument is vectorized, the second arguments must be equal.
         CallInst *CI = dyn_cast<CallInst>(I);
         Function *FI;
         if (CI && (FI = CI->getCalledFunction())) {
           Intrinsic::ID IID = (Intrinsic::ID) FI->getIntrinsicID();
-          if (IID == Intrinsic::powi) {
+          if (IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
+              IID == Intrinsic::cttz) {
             Value *A1I = CI->getArgOperand(1),
                   *A1J = cast<CallInst>(J)->getArgOperand(1);
             const SCEV *A1ISCEV = SE->getSCEV(A1I),
@@ -1063,7 +1128,8 @@ namespace {
           assert(CI->getNumArgOperands() == CJ->getNumArgOperands() &&
                  "Intrinsic argument counts differ");
           for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
-            if (IID == Intrinsic::powi && i == 1)
+            if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
+                 IID == Intrinsic::cttz) && i == 1)
               Tys.push_back(CI->getArgOperand(i)->getType());
             else
               Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(),
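The net effect of the Op1VK/Op2VK plumbing above is that the pass can ask TTI for shift costs that reflect the shape of the shift amount. Roughly, under the assumption that VecTy is the merged vector type (the concrete costs are target-dependent and shown only as a contrast):

unsigned SplatCost = TTI->getArithmeticInstrCost(
    Instruction::Shl, VecTy,
    TargetTransformInfo::OK_AnyValue,               // value operand: anything
    TargetTransformInfo::OK_UniformConstantValue);  // shift amount: splat const
unsigned PerLaneCost = TTI->getArithmeticInstrCost(
    Instruction::Shl, VecTy,
    TargetTransformInfo::OK_AnyValue,
    TargetTransformInfo::OK_NonUniformConstantValue); // distinct lane amounts
// On targets lacking per-lane variable shifts (e.g. pre-AVX2 x86),
// PerLaneCost is typically much higher, so fusing two shifts with different
// constant amounts can now be rejected by the VCost comparison above.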
@@ -1185,7 +1251,7 @@ namespace {
       if (I->mayWriteToMemory()) WriteSet.add(I);
 
       bool JAfterStart = IAfterStart;
-      BasicBlock::iterator J = llvm::next(I);
+      BasicBlock::iterator J = std::next(I);
       for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) {
         if (J == Start) JAfterStart = true;
 
@@ -1230,7 +1296,7 @@ namespace {
         // The next call to this function must start after the last instruction
         // selected during this invocation.
         if (JAfterStart) {
-          Start = llvm::next(J);
+          Start = std::next(J);
           IAfterStart = JAfterStart = false;
         }
 
@@ -1272,13 +1338,15 @@ namespace {
 
       // For each possible pairing for this variable, look at the uses of
      // the first value...
-      for (Value::use_iterator I = P.first->use_begin(),
-           E = P.first->use_end(); I != E; ++I) {
-        if (isa<LoadInst>(*I)) {
+      for (Value::user_iterator I = P.first->user_begin(),
+                                E = P.first->user_end();
+           I != E; ++I) {
+        User *UI = *I;
+        if (isa<LoadInst>(UI)) {
           // A pair cannot be connected to a load because the load only takes one
           // operand (the address) and it is a scalar even after vectorization.
           continue;
-        } else if ((SI = dyn_cast<StoreInst>(*I)) &&
+        } else if ((SI = dyn_cast<StoreInst>(UI)) &&
                    P.first == SI->getPointerOperand()) {
           // Similarly, a pair cannot be connected to a store through its
           // pointer operand.
@@ -1287,22 +1355,21 @@ namespace {
 
         // For each use of the first variable, look for uses of the second
         // variable...
-        for (Value::use_iterator J = P.second->use_begin(),
-             E2 = P.second->use_end(); J != E2; ++J) {
-          if ((SJ = dyn_cast<StoreInst>(*J)) &&
+        for (User *UJ : P.second->users()) {
+          if ((SJ = dyn_cast<StoreInst>(UJ)) &&
              P.second == SJ->getPointerOperand())
             continue;
 
           // Look for <I, J>:
-          if (CandidatePairsSet.count(ValuePair(*I, *J))) {
-            VPPair VP(P, ValuePair(*I, *J));
+          if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
+            VPPair VP(P, ValuePair(UI, UJ));
             ConnectedPairs[VP.first].push_back(VP.second);
             PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect));
           }
 
           // Look for <J, I>:
-          if (CandidatePairsSet.count(ValuePair(*J, *I))) {
-            VPPair VP(P, ValuePair(*J, *I));
+          if (CandidatePairsSet.count(ValuePair(UJ, UI))) {
+            VPPair VP(P, ValuePair(UJ, UI));
             ConnectedPairs[VP.first].push_back(VP.second);
             PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap));
           }
@@ -1311,13 +1378,14 @@ namespace {
         if (Config.SplatBreaksChain) continue;
         // Look for cases where just the first value in the pair is used by
         // both members of another pair (splatting).
-        for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) {
-          if ((SJ = dyn_cast<StoreInst>(*J)) &&
+        for (Value::user_iterator J = P.first->user_begin(); J != E; ++J) {
+          User *UJ = *J;
+          if ((SJ = dyn_cast<StoreInst>(UJ)) &&
               P.first == SJ->getPointerOperand())
             continue;
 
-          if (CandidatePairsSet.count(ValuePair(*I, *J))) {
-            VPPair VP(P, ValuePair(*I, *J));
+          if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
+            VPPair VP(P, ValuePair(UI, UJ));
             ConnectedPairs[VP.first].push_back(VP.second);
             PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
           }
@@ -1327,21 +1395,24 @@ namespace {
       if (Config.SplatBreaksChain) return;
       // Look for cases where just the second value in the pair is used by
       // both members of another pair (splatting).
-      for (Value::use_iterator I = P.second->use_begin(),
-           E = P.second->use_end(); I != E; ++I) {
-        if (isa<LoadInst>(*I))
+      for (Value::user_iterator I = P.second->user_begin(),
+                                E = P.second->user_end();
+           I != E; ++I) {
+        User *UI = *I;
+        if (isa<LoadInst>(UI))
           continue;
-        else if ((SI = dyn_cast<StoreInst>(*I)) &&
+        else if ((SI = dyn_cast<StoreInst>(UI)) &&
                  P.second == SI->getPointerOperand())
           continue;
 
-        for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) {
-          if ((SJ = dyn_cast<StoreInst>(*J)) &&
+        for (Value::user_iterator J = P.second->user_begin(); J != E; ++J) {
+          User *UJ = *J;
+          if ((SJ = dyn_cast<StoreInst>(UJ)) &&
              P.second == SJ->getPointerOperand())
            continue;
 
-          if (CandidatePairsSet.count(ValuePair(*I, *J))) {
-            VPPair VP(P, ValuePair(*I, *J));
+          if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
+            VPPair VP(P, ValuePair(UI, UJ));
            ConnectedPairs[VP.first].push_back(VP.second);
             PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
           }
@@ -1407,7 +1478,7 @@ namespace {
       AliasSetTracker WriteSet(*AA);
       if (I->mayWriteToMemory()) WriteSet.add(I);
 
-      for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) {
+      for (BasicBlock::iterator J = std::next(I); J != E; ++J) {
         (void) trackUsesOfI(Users, WriteSet, I, J);
 
         if (J == EL)
@@ -1614,8 +1685,9 @@ namespace {
             C2->first.second == C->first.first ||
             C2->first.second == C->first.second ||
             pairsConflict(C2->first, C->first, PairableInstUsers,
-                          UseCycleCheck ? &PairableInstUserMap : 0,
-                          UseCycleCheck ? &PairableInstUserPairSet : 0)) {
+                          UseCycleCheck ? &PairableInstUserMap : nullptr,
+                          UseCycleCheck ? &PairableInstUserPairSet
+                                        : nullptr)) {
           if (C2->second >= C->second) {
             CanAdd = false;
             break;
@@ -1635,8 +1707,9 @@ namespace {
              T->second == C->first.first ||
              T->second == C->first.second ||
              pairsConflict(*T, C->first, PairableInstUsers,
-                            UseCycleCheck ? &PairableInstUserMap : 0,
-                            UseCycleCheck ? &PairableInstUserPairSet : 0)) {
+                            UseCycleCheck ? &PairableInstUserMap : nullptr,
+                            UseCycleCheck ? &PairableInstUserPairSet
+                                          : nullptr)) {
            CanAdd = false;
            break;
          }
@@ -1653,8 +1726,9 @@ namespace {
             C2->first.second == C->first.first ||
             C2->first.second == C->first.second ||
             pairsConflict(C2->first, C->first, PairableInstUsers,
-                          UseCycleCheck ? &PairableInstUserMap : 0,
-                          UseCycleCheck ? &PairableInstUserPairSet : 0)) {
+                          UseCycleCheck ? &PairableInstUserMap : nullptr,
+                          UseCycleCheck ? &PairableInstUserPairSet
+                                        : nullptr)) {
           CanAdd = false;
           break;
         }
@@ -1669,8 +1743,9 @@ namespace {
            ChosenPairs.begin(), E2 = ChosenPairs.end();
            C2 != E2; ++C2) {
         if (pairsConflict(*C2, C->first, PairableInstUsers,
-                          UseCycleCheck ? &PairableInstUserMap : 0,
-                          UseCycleCheck ? &PairableInstUserPairSet : 0)) {
+                          UseCycleCheck ? &PairableInstUserMap : nullptr,
+                          UseCycleCheck ? &PairableInstUserPairSet
+                                        : nullptr)) {
           CanAdd = false;
           break;
         }
@@ -1751,8 +1826,8 @@ namespace {
       for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(),
            E = ChosenPairs.end(); C != E; ++C) {
         if (pairsConflict(*C, IJ, PairableInstUsers,
-                          UseCycleCheck ? &PairableInstUserMap : 0,
-                          UseCycleCheck ? &PairableInstUserPairSet : 0)) {
+                          UseCycleCheck ? &PairableInstUserMap : nullptr,
+                          UseCycleCheck ? &PairableInstUserPairSet
+                                        : nullptr)) {
           DoesConflict = true;
           break;
         }
@@ -1901,16 +1976,15 @@ namespace {
           Type *VTy = getVecTypeForPair(Ty1, Ty2);
 
           bool NeedsExtraction = false;
-          for (Value::use_iterator I = S->first->use_begin(),
-               IE = S->first->use_end(); I != IE; ++I) {
-            if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) {
+          for (User *U : S->first->users()) {
+            if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) {
               // Shuffle can be folded if it has no other input
               if (isa<UndefValue>(SI->getOperand(1)))
                 continue;
             }
-            if (isa<ExtractElementInst>(*I))
+            if (isa<ExtractElementInst>(U))
               continue;
-            if (PrunedDAGInstrs.count(*I))
+            if (PrunedDAGInstrs.count(U))
               continue;
             NeedsExtraction = true;
             break;
@@ -1933,16 +2007,15 @@ namespace {
           }
 
           NeedsExtraction = false;
-          for (Value::use_iterator I = S->second->use_begin(),
-               IE = S->second->use_end(); I != IE; ++I) {
-            if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) {
+          for (User *U : S->second->users()) {
+            if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) {
               // Shuffle can be folded if it has no other input
               if (isa<UndefValue>(SI->getOperand(1)))
                 continue;
             }
-            if (isa<ExtractElementInst>(*I))
+            if (isa<ExtractElementInst>(U))
              continue;
-            if (PrunedDAGInstrs.count(*I))
+            if (PrunedDAGInstrs.count(U))
               continue;
             NeedsExtraction = true;
             break;
@@ -2324,7 +2397,7 @@ namespace {
           }
         } while ((LIENext =
                    dyn_cast<InsertElementInst>(LIENext->getOperand(0))));
 
-        LIENext = 0;
+        LIENext = nullptr;
         Value *LIEPrev = UndefValue::get(ArgTypeH);
         for (unsigned i = 0; i < numElemL; ++i) {
          if (isa<UndefValue>(VectElemts[i])) continue;
@@ -2392,14 +2465,14 @@ namespace {
       if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) {
         // We can have at most two unique vector inputs.
         bool CanUseInputs = true;
-        Value *I1, *I2 = 0;
+        Value *I1, *I2 = nullptr;
         if (LEE) {
           I1 = LEE->getOperand(0);
         } else {
           I1 = LSV->getOperand(0);
           I2 = LSV->getOperand(1);
           if (I2 == I1 || isa<UndefValue>(I2))
-            I2 = 0;
+            I2 = nullptr;
         }
 
         if (HEE) {
@@ -2715,10 +2788,11 @@ namespace {
             ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType);
             continue;
-          } else if (IID == Intrinsic::powi && o == 1) {
-            // The second argument of powi is a single integer and we've already
-            // checked that both arguments are equal. As a result, we just keep
-            // I's second argument.
+          } else if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
+                      IID == Intrinsic::cttz) && o == 1) {
+            // The second argument of powi/ctlz/cttz is a single integer/constant
+            // and we've already checked that both arguments are equal.
+            // As a result, we just keep I's second argument.
             ReplacedOperands[o] = I->getOperand(o);
             continue;
           }
@@ -2795,7 +2869,7 @@ namespace {
                       DenseSet<ValuePair> &LoadMoveSetPairs,
                       Instruction *I, Instruction *J) {
     // Skip to the first instruction past I.
-    BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
+    BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
 
     DenseSet<Value *> Users;
     AliasSetTracker WriteSet(*AA);
@@ -2817,7 +2891,7 @@ namespace {
                        Instruction *&InsertionPt,
                        Instruction *I, Instruction *J) {
     // Skip to the first instruction past I.
-    BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
+    BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
 
     DenseSet<Value *> Users;
     AliasSetTracker WriteSet(*AA);
@@ -2848,7 +2922,7 @@ namespace {
                      DenseSet<ValuePair> &LoadMoveSetPairs,
                      Instruction *I) {
     // Skip to the first instruction past I.
-    BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
+    BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
 
     DenseSet<Value *> Users;
     AliasSetTracker WriteSet(*AA);
@@ -2903,7 +2977,7 @@ namespace {
 
       switch (Kind) {
       default:
-        K->setMetadata(Kind, 0); // Remove unknown metadata
+        K->setMetadata(Kind, nullptr); // Remove unknown metadata
         break;
       case LLVMContext::MD_tbaa:
         K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD));
@@ -3074,7 +3148,7 @@ namespace {
 
       // Instruction insertion point:
       Instruction *InsertionPt = K;
-      Instruction *K1 = 0, *K2 = 0;
+      Instruction *K1 = nullptr, *K2 = nullptr;
       replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2);
 
       // The use dag of the first original instruction must be moved to after
@@ -3119,7 +3193,7 @@ namespace {
       }
 
       // Before removing I, set the iterator to the next instruction.
-      PI = llvm::next(BasicBlock::iterator(I));
+      PI = std::next(BasicBlock::iterator(I));
       if (cast<Instruction>(PI) == J)
         ++PI;
 
@@ -3141,7 +3215,7 @@ static const char bb_vectorize_name[] = "Basic-Block Vectorization";
 INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
-INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
 
@@ -3164,6 +3238,7 @@ VectorizeConfig::VectorizeConfig() {
   VectorizePointers = !::NoPointers;
   VectorizeCasts = !::NoCasts;
   VectorizeMath = !::NoMath;
+  VectorizeBitManipulations = !::NoBitManipulation;
   VectorizeFMA = !::NoFMA;
   VectorizeSelect = !::NoSelect;
   VectorizeCmp = !::NoCmp;
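A note on the ctlz/cttz special-casing threaded through this file: like powi, these intrinsics carry a trailing scalar argument (for ctlz/cttz, the i1 "is_zero_undef" flag) that stays scalar after fusion and therefore has to agree between the two candidates. In IR terms (illustrative only):

//   %a = call i32 @llvm.ctlz.i32(i32 %x, i1 false)
//   %b = call i32 @llvm.ctlz.i32(i32 %y, i1 false)
// can be fused into
//   %v = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %xy, i1 false)
// but only because both calls pass the same i1 flag; the SCEV equality
// check in canVectorizePair above enforces exactly this.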
"llvm/Support/BranchProbability.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/PatternMatch.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Support/ValueHandle.h" -#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/VectorUtils.h" #include <algorithm> #include <map> +#include <tuple> using namespace llvm; using namespace llvm::PatternMatch; +#define LV_NAME "loop-vectorize" +#define DEBUG_TYPE LV_NAME + +STATISTIC(LoopsVectorized, "Number of loops vectorized"); +STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); + static cl::opt<unsigned> VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, cl::desc("Sets the SIMD width. Zero is autoselect.")); @@ -114,6 +124,21 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), "trip count that is smaller than this " "value.")); +/// This enables versioning on the strides of symbolically striding memory +/// accesses in code like the following. +/// for (i = 0; i < N; ++i) +/// A[i * Stride1] += B[i * Stride2] ... +/// +/// Will be roughly translated to +/// if (Stride1 == 1 && Stride2 == 1) { +/// for (i = 0; i < N; i+=4) +/// A[i:i+3] += ... +/// } else +/// ... +static cl::opt<bool> EnableMemAccessVersioning( + "enable-mem-access-versioning", cl::init(true), cl::Hidden, + cl::desc("Enable symblic stride memory access versioning")); + /// We don't unroll loops with a known constant trip count below this number. static const unsigned TinyTripCountUnrollThreshold = 128; @@ -124,11 +149,60 @@ static const unsigned RuntimeMemoryCheckThreshold = 8; /// Maximum simd width. static const unsigned MaxVectorWidth = 64; +static cl::opt<unsigned> ForceTargetNumScalarRegs( + "force-target-num-scalar-regs", cl::init(0), cl::Hidden, + cl::desc("A flag that overrides the target's number of scalar registers.")); + +static cl::opt<unsigned> ForceTargetNumVectorRegs( + "force-target-num-vector-regs", cl::init(0), cl::Hidden, + cl::desc("A flag that overrides the target's number of vector registers.")); + /// Maximum vectorization unroll count. static const unsigned MaxUnrollFactor = 16; -/// The cost of a loop that is considered 'small' by the unroller. -static const unsigned SmallLoopCost = 20; +static cl::opt<unsigned> ForceTargetMaxScalarUnrollFactor( + "force-target-max-scalar-unroll", cl::init(0), cl::Hidden, + cl::desc("A flag that overrides the target's max unroll factor for scalar " + "loops.")); + +static cl::opt<unsigned> ForceTargetMaxVectorUnrollFactor( + "force-target-max-vector-unroll", cl::init(0), cl::Hidden, + cl::desc("A flag that overrides the target's max unroll factor for " + "vectorized loops.")); + +static cl::opt<unsigned> ForceTargetInstructionCost( + "force-target-instruction-cost", cl::init(0), cl::Hidden, + cl::desc("A flag that overrides the target's expected cost for " + "an instruction to a single constant value. 
Mostly " + "useful for getting consistent testing.")); + +static cl::opt<unsigned> SmallLoopCost( + "small-loop-cost", cl::init(20), cl::Hidden, + cl::desc("The cost of a loop that is considered 'small' by the unroller.")); + +static cl::opt<bool> LoopVectorizeWithBlockFrequency( + "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden, + cl::desc("Enable the use of the block frequency analysis to access PGO " + "heuristics minimizing code growth in cold regions and being more " + "aggressive in hot regions.")); + +// Runtime unroll loops for load/store throughput. +static cl::opt<bool> EnableLoadStoreRuntimeUnroll( + "enable-loadstore-runtime-unroll", cl::init(true), cl::Hidden, + cl::desc("Enable runtime unrolling until load/store ports are saturated")); + +/// The number of stores in a loop that are allowed to need predication. +static cl::opt<unsigned> NumberOfStoresToPredicate( + "vectorize-num-stores-pred", cl::init(1), cl::Hidden, + cl::desc("Max number of stores to be predicated behind an if.")); + +static cl::opt<bool> EnableIndVarRegisterHeur( + "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, + cl::desc("Count the induction variable only once when unrolling")); + +static cl::opt<bool> EnableCondStoresVectorization( + "enable-cond-stores-vec", cl::init(false), cl::Hidden, + cl::desc("Enable if predication of stores during vectorization.")); namespace { @@ -136,6 +210,29 @@ namespace { class LoopVectorizationLegality; class LoopVectorizationCostModel; +/// Optimization analysis message produced during vectorization. Messages inform +/// the user why vectorization did not occur. +class Report { + std::string Message; + raw_string_ostream Out; + Instruction *Instr; + +public: + Report(Instruction *I = nullptr) : Out(Message), Instr(I) { + Out << "loop not vectorized: "; + } + + template <typename A> Report &operator<<(const A &Value) { + Out << Value; + return *this; + } + + Instruction *getInstr() { return Instr; } + + std::string &str() { return Out.str(); } + operator Twine() { return Out.str(); } +}; + /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -153,20 +250,22 @@ class LoopVectorizationCostModel; class InnerLoopVectorizer { public: InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, - DominatorTree *DT, DataLayout *DL, + DominatorTree *DT, const DataLayout *DL, const TargetLibraryInfo *TLI, unsigned VecWidth, unsigned UnrollFactor) : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI), - VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(0), - OldInduction(0), WidenMap(UnrollFactor) {} + VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), + Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor), + Legal(nullptr) {} // Perform the actual loop widening (vectorization). - void vectorize(LoopVectorizationLegality *Legal) { + void vectorize(LoopVectorizationLegality *L) { + Legal = L; // Create a new empty loop. Unlink the old loop and connect the new one. - createEmptyLoop(Legal); + createEmptyLoop(); // Widen each instruction in the old loop to a new one in the new loop. // Use the Legality module to find the induction and reduction variables. - vectorizeLoop(Legal); + vectorizeLoop(); // Register the new loop and update the analysis passes. 
@@ -153,20 +250,22 @@ class LoopVectorizationCostModel;
 class InnerLoopVectorizer {
 public:
   InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
-                      DominatorTree *DT, DataLayout *DL,
+                      DominatorTree *DT, const DataLayout *DL,
                       const TargetLibraryInfo *TLI, unsigned VecWidth,
                       unsigned UnrollFactor)
       : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI),
-        VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(0),
-        OldInduction(0), WidenMap(UnrollFactor) {}
+        VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
+        Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
+        Legal(nullptr) {}
 
   // Perform the actual loop widening (vectorization).
-  void vectorize(LoopVectorizationLegality *Legal) {
+  void vectorize(LoopVectorizationLegality *L) {
+    Legal = L;
     // Create a new empty loop. Unlink the old loop and connect the new one.
-    createEmptyLoop(Legal);
+    createEmptyLoop();
     // Widen each instruction in the old loop to a new one in the new loop.
     // Use the Legality module to find the induction and reduction variables.
-    vectorizeLoop(Legal);
+    vectorizeLoop();
     // Register the new loop and update the analysis passes.
     updateAnalysis();
   }
 
@@ -186,14 +285,23 @@ protected:
   typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>, VectorParts>
     EdgeMaskCache;
 
-  /// Add code that checks at runtime if the accessed arrays overlap.
-  /// Returns the comparator value or NULL if no check is needed.
-  Instruction *addRuntimeCheck(LoopVectorizationLegality *Legal,
-                               Instruction *Loc);
+  /// \brief Add code that checks at runtime if the accessed arrays overlap.
+  ///
+  /// Returns a pair of instructions where the first element is the first
+  /// instruction generated in possibly a sequence of instructions and the
+  /// second value is the final comparator value or NULL if no check is needed.
+  std::pair<Instruction *, Instruction *> addRuntimeCheck(Instruction *Loc);
+
+  /// \brief Add checks for strides that where assumed to be 1.
+  ///
+  /// Returns the last check instruction and the first check instruction in the
+  /// pair as (first, last).
+  std::pair<Instruction *, Instruction *> addStrideCheck(Instruction *Loc);
+
   /// Create an empty loop, based on the loop ranges of the old loop.
-  void createEmptyLoop(LoopVectorizationLegality *Legal);
+  void createEmptyLoop();
   /// Copy and widen the instructions from the old loop.
-  virtual void vectorizeLoop(LoopVectorizationLegality *Legal);
+  virtual void vectorizeLoop();
 
   /// \brief The Loop exit block may have single value PHI nodes where the
   /// incoming value is 'Undef'. While vectorizing we only handled real values
@@ -210,14 +318,12 @@ protected:
   VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
 
   /// A helper function to vectorize a single BB within the innermost loop.
-  void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB,
-                            PhiVector *PV);
+  void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);
 
   /// Vectorize a single PHINode in a block. This method handles the induction
   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
   /// arbitrary length vectors.
   void widenPHIInstruction(Instruction *PN, VectorParts &Entry,
-                           LoopVectorizationLegality *Legal,
                            unsigned UF, unsigned VF, PhiVector *PV);
 
   /// Insert the new loop to the loop hierarchy and pass manager
@@ -225,12 +331,14 @@ protected:
   void updateAnalysis();
 
   /// This instruction is un-vectorizable. Implement it as a sequence
-  /// of scalars.
-  virtual void scalarizeInstruction(Instruction *Instr);
+  /// of scalars. If \p IfPredicateStore is true we need to 'hide' each
+  /// scalarized instruction behind an if block predicated on the control
+  /// dependence of the instruction.
+  virtual void scalarizeInstruction(Instruction *Instr,
+                                    bool IfPredicateStore=false);
 
   /// Vectorize Load and Store instructions,
-  virtual void vectorizeMemoryInstruction(Instruction *Instr,
-                                    LoopVectorizationLegality *Legal);
+  virtual void vectorizeMemoryInstruction(Instruction *Instr);
 
   /// Create a broadcast instruction. This method generates a broadcast
   /// instruction (shuffle) for loop invariant values and for the induction
@@ -302,8 +410,10 @@ protected:
   LoopInfo *LI;
   /// Dominator Tree.
   DominatorTree *DT;
+  /// Alias Analysis.
+  AliasAnalysis *AA;
   /// Data Layout.
-  DataLayout *DL;
+  const DataLayout *DL;
   /// Target Library Info.
   const TargetLibraryInfo *TLI;
 
@@ -330,7 +440,7 @@ protected:
   ///The ExitBlock of the scalar loop.
   BasicBlock *LoopExitBlock;
   ///The vector loop body.
-  BasicBlock *LoopVectorBody;
+  SmallVector<BasicBlock *, 4> LoopVectorBody;
   ///The scalar loop body.
   BasicBlock *LoopScalarBody;
   /// A list of all bypass blocks. The first block is the entry of the loop.
@@ -345,22 +455,24 @@ protected:
   /// Maps scalars to widened vectors.
   ValueMap WidenMap;
   EdgeMaskCache MaskCache;
+
+  LoopVectorizationLegality *Legal;
 };
 
 class InnerLoopUnroller : public InnerLoopVectorizer {
 public:
   InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
-                    DominatorTree *DT, DataLayout *DL,
+                    DominatorTree *DT, const DataLayout *DL,
                     const TargetLibraryInfo *TLI, unsigned UnrollFactor) :
     InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { }
 
private:
-  virtual void scalarizeInstruction(Instruction *Instr);
-  virtual void vectorizeMemoryInstruction(Instruction *Instr,
-                                          LoopVectorizationLegality *Legal);
-  virtual Value *getBroadcastInstrs(Value *V);
-  virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
-  virtual Value *reverseVector(Value *Vec);
+  void scalarizeInstruction(Instruction *Instr,
+                            bool IfPredicateStore = false) override;
+  void vectorizeMemoryInstruction(Instruction *Instr) override;
+  Value *getBroadcastInstrs(Value *V) override;
+  Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate) override;
+  Value *reverseVector(Value *Vec) override;
 };
 
 /// \brief Look for a meaningful debug location on the instruction or it's
@@ -391,6 +503,52 @@ static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
     B.SetCurrentDebugLocation(DebugLoc());
 }
 
+#ifndef NDEBUG
+/// \return string containing a file name and a line # for the given loop.
+static std::string getDebugLocString(const Loop *L) {
+  std::string Result;
+  if (L) {
+    raw_string_ostream OS(Result);
+    const DebugLoc LoopDbgLoc = L->getStartLoc();
+    if (!LoopDbgLoc.isUnknown())
+      LoopDbgLoc.print(L->getHeader()->getContext(), OS);
+    else
+      // Just print the module name.
+      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
+    OS.flush();
+  }
+  return Result;
+}
+#endif
+
+/// \brief Propagate known metadata from one instruction to another.
+static void propagateMetadata(Instruction *To, const Instruction *From) {
+  SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
+  From->getAllMetadataOtherThanDebugLoc(Metadata);
+
+  for (auto M : Metadata) {
+    unsigned Kind = M.first;
+
+    // These are safe to transfer (this is safe for TBAA, even when we
+    // if-convert, because should that metadata have had a control dependency
+    // on the condition, and thus actually aliased with some other
+    // non-speculated memory access when the condition was false, this would be
+    // caught by the runtime overlap checks).
+    if (Kind != LLVMContext::MD_tbaa &&
+        Kind != LLVMContext::MD_fpmath)
+      continue;
+
+    To->setMetadata(Kind, M.second);
+  }
+}
+
+/// \brief Propagate known metadata from one instruction to a vector of others.
+static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *From) {
+  for (Value *V : To)
+    if (Instruction *I = dyn_cast<Instruction>(V))
+      propagateMetadata(I, From);
+}
+
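The two propagateMetadata helpers above are used when the vectorizer replaces a group of scalar memory operations with a single wide one; only TBAA and fpmath survive the copy. A two-line sketch, where Builder, VecPtr and the scalar load Ld are assumed context rather than names from this diff:

Instruction *Wide = Builder.CreateLoad(VecPtr, "wide.load");
propagateMetadata(Wide, Ld);  // copies !tbaa / !fpmath from Ld, drops the rest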
 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
 /// to what vectorization factor.
 /// This class does not look at the profitability of vectorization, only the
@@ -406,11 +564,17 @@ static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
 /// induction variable and the different reduction variables.
 class LoopVectorizationLegality {
 public:
-  LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
-                            DominatorTree *DT, TargetLibraryInfo *TLI)
-      : TheLoop(L), SE(SE), DL(DL), DT(DT), TLI(TLI),
-        Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false),
-        MaxSafeDepDistBytes(-1U) {}
+  unsigned NumLoads;
+  unsigned NumStores;
+  unsigned NumPredStores;
+
+  LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL,
+                            DominatorTree *DT, TargetLibraryInfo *TLI,
+                            AliasAnalysis *AA, Function *F)
+      : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
+        DT(DT), TLI(TLI), AA(AA), TheFunction(F), Induction(nullptr),
+        WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {
+  }
 
   /// This enum represents the kinds of reductions that we support.
   enum ReductionKind {
@@ -448,7 +612,7 @@ public:
 
   /// This struct holds information about reduction variables.
   struct ReductionDescriptor {
-    ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
+    ReductionDescriptor() : StartValue(nullptr), LoopExitInstr(nullptr),
       Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {}
 
     ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K,
@@ -496,11 +660,12 @@ public:
       Ends.clear();
       IsWritePtr.clear();
       DependencySetId.clear();
+      AliasSetId.clear();
     }
 
     /// Insert a pointer and calculate the start and end SCEVs.
     void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr,
-                unsigned DepSetId);
+                unsigned DepSetId, unsigned ASId, ValueToValueMap &Strides);
 
     /// This flag indicates if we need to add the runtime check.
     bool Need;
@@ -515,12 +680,14 @@ public:
     /// Holds the id of the set of pointers that could be dependent because of a
     /// shared underlying object.
     SmallVector<unsigned, 2> DependencySetId;
+    /// Holds the id of the disjoint alias set to which this pointer belongs.
+    SmallVector<unsigned, 2> AliasSetId;
   };
 
   /// A struct for saving information about induction variables.
   struct InductionInfo {
     InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
-    InductionInfo() : StartValue(0), IK(IK_NoInduction) {}
+    InductionInfo() : StartValue(nullptr), IK(IK_NoInduction) {}
     /// Start value.
     TrackingVH<Value> StartValue;
     /// Induction kind.
@@ -564,7 +731,7 @@ public:
   /// pointer itself is an induction variable.
   /// This check allows us to vectorize A[idx] into a wide load/store.
   /// Returns:
-  /// 0 - Stride is unknown or non consecutive.
+  /// 0 - Stride is unknown or non-consecutive.
   /// 1 - Address is consecutive.
   /// -1 - Address is consecutive, and decreasing.
   int isConsecutivePtr(Value *Ptr);
@@ -584,6 +751,13 @@ public:
 
   unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
 
+  bool hasStride(Value *V) { return StrideSet.count(V); }
+  bool mustCheckStrides() { return !StrideSet.empty(); }
+  SmallPtrSet<Value *, 8>::iterator strides_begin() {
+    return StrideSet.begin();
+  }
+  SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); }
+
 private:
   /// Check if a single basic block loop is vectorizable.
   /// At this point we know that this is a loop with a constant trip count
@@ -626,16 +800,36 @@ private:
   /// if the PHI is not an induction variable.
   InductionKind isInductionVariable(PHINode *Phi);
 
+  /// \brief Collect memory access with loop invariant strides.
+  ///
+  /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop
+  /// invariant.
+  void collectStridedAcccess(Value *LoadOrStoreInst);
+
+  /// Report an analysis message to assist the user in diagnosing loops that are
+  /// not vectorized.
+  void emitAnalysis(Report &Message) {
+    DebugLoc DL = TheLoop->getStartLoc();
+    if (Instruction *I = Message.getInstr())
+      DL = I->getDebugLoc();
+    emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE,
+                                   *TheFunction, DL, Message.str());
+  }
+
   /// The loop that we evaluate.
   Loop *TheLoop;
   /// Scev analysis.
   ScalarEvolution *SE;
   /// DataLayout analysis.
-  DataLayout *DL;
+  const DataLayout *DL;
   /// Dominators.
   DominatorTree *DT;
   /// Target Library Info.
   TargetLibraryInfo *TLI;
+  /// Alias analysis.
+  AliasAnalysis *AA;
+  /// Parent function
+  Function *TheFunction;
 
   // ---  vectorization state --- //
 
@@ -664,6 +858,9 @@ private:
   bool HasFunNoNaNAttr;
 
   unsigned MaxSafeDepDistBytes;
+
+  ValueToValueMap Strides;
+  SmallPtrSet<Value *, 8> StrideSet;
 };
 
 /// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -678,7 +875,7 @@ public:
   LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
                              LoopVectorizationLegality *Legal,
                              const TargetTransformInfo &TTI,
-                             DataLayout *DL, const TargetLibraryInfo *TLI)
+                             const DataLayout *DL, const TargetLibraryInfo *TLI)
       : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI) {}
 
   /// Information about vectorization costs
@@ -691,7 +888,8 @@ public:
   /// then this vectorization factor will be selected if vectorization is
   /// possible.
   VectorizationFactor selectVectorizationFactor(bool OptForSize,
-                                                unsigned UserVF);
+                                                unsigned UserVF,
+                                                bool ForceVectorization);
 
   /// \return The size (in bits) of the widest type in the code that
   /// needs to be vectorized. We ignore values that remain scalar such as
@@ -751,39 +949,39 @@ private:
   /// Vector target information.
   const TargetTransformInfo &TTI;
   /// Target data layout information.
-  DataLayout *DL;
+  const DataLayout *DL;
   /// Target Library Info.
   const TargetLibraryInfo *TLI;
 };
 
 /// Utility class for getting and setting loop vectorizer hints in the form
 /// of loop metadata.
-struct LoopVectorizeHints {
-  /// Vectorization width.
-  unsigned Width;
-  /// Vectorization unroll factor.
-  unsigned Unroll;
+class LoopVectorizeHints {
+public:
+  enum ForceKind {
+    FK_Undefined = -1, ///< Not selected.
+    FK_Disabled = 0,   ///< Forcing disabled.
+    FK_Enabled = 1,    ///< Forcing enabled.
+  };
 
   LoopVectorizeHints(const Loop *L, bool DisableUnrolling)
-    : Width(VectorizationFactor)
-    , Unroll(DisableUnrolling ? 1 : VectorizationUnroll)
-    , LoopID(L->getLoopID()) {
+      : Width(VectorizationFactor),
+        Unroll(DisableUnrolling),
+        Force(FK_Undefined),
+        LoopID(L->getLoopID()) {
     getHints(L);
-    // The command line options override any loop metadata except for when
-    // width == 1 which is used to indicate the loop is already vectorized.
-    if (VectorizationFactor.getNumOccurrences() > 0 && Width != 1)
-      Width = VectorizationFactor;
+    // force-vector-unroll overrides DisableUnrolling.
     if (VectorizationUnroll.getNumOccurrences() > 0)
       Unroll = VectorizationUnroll;
 
-    DEBUG(if (DisableUnrolling && Unroll == 1)
-            dbgs() << "LV: Unrolling disabled by the pass manager\n");
+    DEBUG(if (DisableUnrolling && Unroll == 1) dbgs()
+          << "LV: Unrolling disabled by the pass manager\n");
   }
 
-  /// Return the loop vectorizer metadata prefix.
-  static StringRef Prefix() { return "llvm.vectorizer."; }
+  /// Return the loop metadata prefix.
+  static StringRef Prefix() { return "llvm.loop."; }
 
-  MDNode *createHint(LLVMContext &Context, StringRef Name, unsigned V) {
+  MDNode *createHint(LLVMContext &Context, StringRef Name, unsigned V) const {
     SmallVector<Value*, 2> Vals;
     Vals.push_back(MDString::get(Context, Name));
     Vals.push_back(ConstantInt::get(Type::getInt32Ty(Context), V));
@@ -803,8 +1001,10 @@ struct LoopVectorizeHints {
       for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i)
         Vals.push_back(LoopID->getOperand(i));
 
-    Vals.push_back(createHint(Context, Twine(Prefix(), "width").str(), Width));
-    Vals.push_back(createHint(Context, Twine(Prefix(), "unroll").str(), 1));
+    Vals.push_back(
+        createHint(Context, Twine(Prefix(), "vectorize.width").str(), Width));
+    Vals.push_back(
+        createHint(Context, Twine(Prefix(), "interleave.count").str(), 1));
 
     MDNode *NewLoopID = MDNode::get(Context, Vals);
     // Set operand 0 to refer to the loop id itself.
@@ -817,9 +1017,35 @@ struct LoopVectorizeHints {
     LoopID = NewLoopID;
   }
 
-private:
-  MDNode *LoopID;
+  std::string emitRemark() const {
+    Report R;
+    R << "vectorization ";
+    switch (Force) {
+    case LoopVectorizeHints::FK_Disabled:
+      R << "is explicitly disabled";
+      break;
+    case LoopVectorizeHints::FK_Enabled:
+      R << "is explicitly enabled";
+      if (Width != 0 && Unroll != 0)
+        R << " with width " << Width << " and interleave count " << Unroll;
+      else if (Width != 0)
+        R << " with width " << Width;
+      else if (Unroll != 0)
+        R << " with interleave count " << Unroll;
+      break;
+    case LoopVectorizeHints::FK_Undefined:
+      R << "was not specified";
+      break;
+    }
+    return R.str();
+  }
 
+  unsigned getWidth() const { return Width; }
+  unsigned getUnroll() const { return Unroll; }
+  enum ForceKind getForce() const { return Force; }
+  MDNode *getLoopID() const { return LoopID; }
+
+private:
   /// Find hints specified in the loop metadata.
   void getHints(const Loop *L) {
     if (!LoopID)
@@ -830,7 +1056,7 @@ private:
     assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
 
     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
-      const MDString *S = 0;
+      const MDString *S = nullptr;
       SmallVector<Value*, 4> Args;
 
       // The expected hint is either a MDString or a MDNode with the first
@@ -849,7 +1075,7 @@ private:
       if (!S)
        continue;
 
-      // Check if the hint starts with the vectorizer prefix.
+      // Check if the hint starts with the loop metadata prefix.
       StringRef Hint = S->getString();
       if (!Hint.startswith(Prefix()))
        continue;
@@ -867,12 +1093,18 @@ private:
       if (!C) return;
       unsigned Val = C->getZExtValue();
 
-      if (Hint == "width") {
+      if (Hint == "vectorize.width") {
        if (isPowerOf2_32(Val) && Val <= MaxVectorWidth)
          Width = Val;
        else
          DEBUG(dbgs() << "LV: ignoring invalid width hint metadata\n");
-      } else if (Hint == "unroll") {
+      } else if (Hint == "vectorize.enable") {
+        if (C->getBitWidth() == 1)
+          Force = Val == 1 ? LoopVectorizeHints::FK_Enabled
+                           : LoopVectorizeHints::FK_Disabled;
+        else
+          DEBUG(dbgs() << "LV: ignoring invalid enable hint metadata\n");
+      } else if (Hint == "interleave.count") {
        if (isPowerOf2_32(Val) && Val <= MaxUnrollFactor)
          Unroll = Val;
        else
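For orientation, the renamed hints ("llvm.loop.vectorize.width", "llvm.loop.interleave.count", "llvm.loop.vectorize.enable") are what Clang's loop pragmas of this era lower to. A source-level sketch (the metadata form is illustrative of the encoding, not copied from this change):

#pragma clang loop vectorize(enable) vectorize_width(4) interleave_count(2)
for (int i = 0; i < n; ++i)
  a[i] = b[i] + c[i];
// lowers to loop metadata along the lines of:
//   br i1 %cond, label %loop, label %exit, !llvm.loop !0
//   !0 = metadata !{metadata !0, metadata !1, metadata !2}
//   !1 = metadata !{metadata !"llvm.loop.vectorize.width", i32 4}
//   !2 = metadata !{metadata !"llvm.loop.interleave.count", i32 2}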
@@ -881,62 +1113,192 @@ private:
         DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint << '\n');
       }
     }
   }
+
+  /// Vectorization width.
+  unsigned Width;
+  /// Vectorization unroll factor.
+  unsigned Unroll;
+  /// Vectorization forced
+  enum ForceKind Force;
+
+  MDNode *LoopID;
 };
 
+static void emitMissedWarning(Function *F, Loop *L,
+                              const LoopVectorizeHints &LH) {
+  emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F,
+                               L->getStartLoc(), LH.emitRemark());
+
+  if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
+    if (LH.getWidth() != 1)
+      emitLoopVectorizeWarning(
+          F->getContext(), *F, L->getStartLoc(),
+          "failed explicitly specified loop vectorization");
+    else if (LH.getUnroll() != 1)
+      emitLoopInterleaveWarning(
+          F->getContext(), *F, L->getStartLoc(),
+          "failed explicitly specified loop interleaving");
+  }
+}
+
+static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
+  if (L.empty())
+    return V.push_back(&L);
+
+  for (Loop *InnerL : L)
+    addInnerLoop(*InnerL, V);
+}
+
 /// The LoopVectorize Pass.
-struct LoopVectorize : public LoopPass {
+struct LoopVectorize : public FunctionPass {
   /// Pass identification, replacement for typeid
   static char ID;
 
-  explicit LoopVectorize(bool NoUnrolling = false)
-    : LoopPass(ID), DisableUnrolling(NoUnrolling) {
+  explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
+      : FunctionPass(ID),
+        DisableUnrolling(NoUnrolling),
+        AlwaysVectorize(AlwaysVectorize) {
     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
   }
 
   ScalarEvolution *SE;
-  DataLayout *DL;
+  const DataLayout *DL;
   LoopInfo *LI;
   TargetTransformInfo *TTI;
   DominatorTree *DT;
+  BlockFrequencyInfo *BFI;
   TargetLibraryInfo *TLI;
+  AliasAnalysis *AA;
   bool DisableUnrolling;
+  bool AlwaysVectorize;
 
-  virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
-    // We only vectorize innermost loops.
-    if (!L->empty())
-      return false;
+  BlockFrequency ColdEntryFreq;
 
+  bool runOnFunction(Function &F) override {
     SE = &getAnalysis<ScalarEvolution>();
-    DL = getAnalysisIfAvailable<DataLayout>();
+    DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+    DL = DLP ? &DLP->getDataLayout() : nullptr;
     LI = &getAnalysis<LoopInfo>();
     TTI = &getAnalysis<TargetTransformInfo>();
-    DT = &getAnalysis<DominatorTree>();
+    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    BFI = &getAnalysis<BlockFrequencyInfo>();
     TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+    AA = &getAnalysis<AliasAnalysis>();
+
+    // Compute some weights outside of the loop over the loops. Compute this
+    // using a BranchProbability to re-use its scaling math.
+    const BranchProbability ColdProb(1, 5); // 20%
+    ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;
 
     // If the target claims to have no vector registers don't attempt
     // vectorization.
     if (!TTI->getNumberOfRegisters(true))
       return false;
 
-    if (DL == NULL) {
-      DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout\n");
+    if (!DL) {
+      DEBUG(dbgs() << "\nLV: Not vectorizing " << F.getName()
                   << ": Missing data layout\n");
       return false;
     }
 
-    DEBUG(dbgs() << "LV: Checking a loop in \"" <<
-          L->getHeader()->getParent()->getName() << "\"\n");
+    // Build up a worklist of inner-loops to vectorize. This is necessary as
+    // the act of vectorizing or partially unrolling a loop creates new loops
+    // and can invalidate iterators across the loops.
+    SmallVector<Loop *, 8> Worklist;
+
+    for (Loop *L : *LI)
+      addInnerLoop(*L, Worklist);
+
+    LoopsAnalyzed += Worklist.size();
+
+    // Now walk the identified inner loops.
+    bool Changed = false;
+    while (!Worklist.empty())
+      Changed |= processLoop(Worklist.pop_back_val());
+
+    // Process each loop nest in the function.
+    return Changed;
+  }
+
+  bool processLoop(Loop *L) {
+    assert(L->empty() && "Only process inner loops.");
+
+#ifndef NDEBUG
+    const std::string DebugLocStr = getDebugLocString(L);
+#endif /* NDEBUG */
+
+    DEBUG(dbgs() << "\nLV: Checking a loop in \""
+                 << L->getHeader()->getParent()->getName() << "\" from "
+                 << DebugLocStr << "\n");
 
     LoopVectorizeHints Hints(L, DisableUnrolling);
 
-    if (Hints.Width == 1 && Hints.Unroll == 1) {
-      DEBUG(dbgs() << "LV: Not vectorizing.\n");
+    DEBUG(dbgs() << "LV: Loop hints:"
+                 << " force="
+                 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
+                         ? "disabled"
+                         : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
+                                ? "enabled"
+                                : "?")) << " width=" << Hints.getWidth()
+                 << " unroll=" << Hints.getUnroll() << "\n");
+
+    // Function containing loop
+    Function *F = L->getHeader()->getParent();
+
+    // Looking at the diagnostic output is the only way to determine if a loop
+    // was vectorized (other than looking at the IR or machine code), so it
+    // is important to generate an optimization remark for each loop. Most of
+    // these messages are generated by emitOptimizationRemarkAnalysis. Remarks
+    // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are
+    // less verbose reporting vectorized loops and unvectorized loops that may
+    // benefit from vectorization, respectively.
+
+    if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) {
+      DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
+      emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
+                                     L->getStartLoc(), Hints.emitRemark());
       return false;
     }
 
+    if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) {
+      DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
+      emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
+                                     L->getStartLoc(), Hints.emitRemark());
+      return false;
+    }
+
+    if (Hints.getWidth() == 1 && Hints.getUnroll() == 1) {
+      DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
+      emitOptimizationRemarkAnalysis(
+          F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
+          "loop not vectorized: vector width and interleave count are "
+          "explicitly set to 1");
+      return false;
+    }
+
+    // Check the loop for a trip count threshold:
+    // do not vectorize loops with a tiny trip count.
+    BasicBlock *Latch = L->getLoopLatch();
+    const unsigned TC = SE->getSmallConstantTripCount(L, Latch);
+    if (TC > 0u && TC < TinyTripCountVectorThreshold) {
+      DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
+                   << "This loop is not worth vectorizing.");
+      if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
+        DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
+      else {
+        DEBUG(dbgs() << "\n");
+        emitOptimizationRemarkAnalysis(
+            F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
+            "vectorization is not beneficial and is not explicitly forced");
+        return false;
+      }
+    }
+
     // Check if it is legal to vectorize the loop.
-    LoopVectorizationLegality LVL(L, SE, DL, DT, TLI);
+    LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F);
     if (!LVL.canVectorize()) {
-      DEBUG(dbgs() << "LV: Not vectorizing.\n");
+      DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
+      emitMissedWarning(F, L, Hints);
       return false;
     }
 
@@ -945,41 +1307,81 @@
 
     // Check the function attributes to find out if this function should be
     // optimized for size.
-    Function *F = L->getHeader()->getParent();
-    Attribute::AttrKind SzAttr = Attribute::OptimizeForSize;
-    Attribute::AttrKind FlAttr = Attribute::NoImplicitFloat;
-    unsigned FnIndex = AttributeSet::FunctionIndex;
-    bool OptForSize = F->getAttributes().hasAttribute(FnIndex, SzAttr);
-    bool NoFloat = F->getAttributes().hasAttribute(FnIndex, FlAttr);
+    bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
+                      F->hasFnAttribute(Attribute::OptimizeForSize);
+
+    // Compute the weighted frequency of this loop being executed and see if it
+    // is less than 20% of the function entry baseline frequency. Note that we
+    // always have a canonical loop here because we think we *can* vectorize.
+    // FIXME: This is hidden behind a flag due to pervasive problems with
+    // exactly what block frequency models.
+    if (LoopVectorizeWithBlockFrequency) {
+      BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
+      if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
+          LoopEntryFreq < ColdEntryFreq)
+        OptForSize = true;
+    }

-    if (NoFloat) {
+    // Check the function attributes to see if implicit floats are allowed.
+    // FIXME: This check doesn't seem possibly correct -- what if the loop is
+    // an integer loop and the vector instructions selected are purely integer
+    // vector instructions?
+    if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
       DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
             "attribute is used.\n");
+      emitOptimizationRemarkAnalysis(
+          F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
+          "loop not vectorized due to NoImplicitFloat attribute");
+      emitMissedWarning(F, L, Hints);
       return false;
     }

     // Select the optimal vectorization factor.
-    LoopVectorizationCostModel::VectorizationFactor VF;
-    VF = CM.selectVectorizationFactor(OptForSize, Hints.Width);
+    const LoopVectorizationCostModel::VectorizationFactor VF =
+        CM.selectVectorizationFactor(OptForSize, Hints.getWidth(),
+                                     Hints.getForce() ==
+                                         LoopVectorizeHints::FK_Enabled);
+
     // Select the unroll factor.
-    unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width,
-                                        VF.Cost);
+    const unsigned UF =
+        CM.selectUnrollFactor(OptForSize, Hints.getUnroll(), VF.Width, VF.Cost);

-    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<<
-          F->getParent()->getModuleIdentifier() << '\n');
+    DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
+                 << DebugLocStr << '\n');
     DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n');

     if (VF.Width == 1) {
-      DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
-      if (UF == 1)
+      DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n");
+
+      if (UF == 1) {
+        emitOptimizationRemarkAnalysis(
+            F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
+            "not beneficial to vectorize and user disabled interleaving");
         return false;
+      }
+      DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n");
+
+      // Report the unrolling decision.
+      emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
+                             Twine("unrolled with interleaving factor " +
+                                   Twine(UF) +
+                                   " (vectorization not beneficial)"));
+
       // We decided not to vectorize, but we may want to unroll.
+
       InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF);
       Unroller.vectorize(&LVL);
     } else {
       // If we decided that it is *legal* to vectorize the loop then do it.
       InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
       LB.vectorize(&LVL);
+      ++LoopsVectorized;
+
+      // Report the vectorization decision.
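
The cold-loop heuristic above is plain scaled-integer arithmetic: the function entry frequency is scaled by 20% once, and any loop whose preheader frequency falls below that is costed for size. A minimal sketch of the math, assuming raw 64-bit counts in place of LLVM's BlockFrequency and BranchProbability types:

#include <cstdint>

// Scale Freq by N/D without floating point, splitting the division to avoid
// overflow, in the spirit of BranchProbability scaling.
static uint64_t scaleFreq(uint64_t Freq, uint64_t N, uint64_t D) {
  return Freq / D * N + (Freq % D) * N / D;
}

static bool loopLooksCold(uint64_t FunctionEntryFreq, uint64_t LoopEntryFreq) {
  uint64_t ColdEntryFreq = scaleFreq(FunctionEntryFreq, 1, 5); // 20%
  return LoopEntryFreq < ColdEntryFreq; // cold: optimize the loop for size
}
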
+ emitOptimizationRemark( + F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), + Twine("vectorized loop (vectorization factor: ") + Twine(VF.Width) + + ", unrolling interleave factor: " + Twine(UF) + ")"); } // Mark the loop as already vectorized to avoid vectorizing again. @@ -989,16 +1391,18 @@ struct LoopVectorize : public LoopPass { return true; } - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - LoopPass::getAnalysisUsage(AU); + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); - AU.addRequired<DominatorTree>(); + AU.addRequired<BlockFrequencyInfo>(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfo>(); AU.addRequired<ScalarEvolution>(); AU.addRequired<TargetTransformInfo>(); + AU.addRequired<AliasAnalysis>(); AU.addPreserved<LoopInfo>(); - AU.addPreserved<DominatorTree>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<AliasAnalysis>(); } }; @@ -1010,12 +1414,53 @@ struct LoopVectorize : public LoopPass { // LoopVectorizationCostModel. //===----------------------------------------------------------------------===// -void -LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, - Loop *Lp, Value *Ptr, - bool WritePtr, - unsigned DepSetId) { - const SCEV *Sc = SE->getSCEV(Ptr); +static Value *stripIntegerCast(Value *V) { + if (CastInst *CI = dyn_cast<CastInst>(V)) + if (CI->getOperand(0)->getType()->isIntegerTy()) + return CI->getOperand(0); + return V; +} + +///\brief Replaces the symbolic stride in a pointer SCEV expression by one. +/// +/// If \p OrigPtr is not null, use it to look up the stride value instead of +/// \p Ptr. +static const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE, + ValueToValueMap &PtrToStride, + Value *Ptr, Value *OrigPtr = nullptr) { + + const SCEV *OrigSCEV = SE->getSCEV(Ptr); + + // If there is an entry in the map return the SCEV of the pointer with the + // symbolic stride replaced by one. + ValueToValueMap::iterator SI = PtrToStride.find(OrigPtr ? OrigPtr : Ptr); + if (SI != PtrToStride.end()) { + Value *StrideVal = SI->second; + + // Strip casts. + StrideVal = stripIntegerCast(StrideVal); + + // Replace symbolic stride by one. + Value *One = ConstantInt::get(StrideVal->getType(), 1); + ValueToValueMap RewriteMap; + RewriteMap[StrideVal] = One; + + const SCEV *ByOne = + SCEVParameterRewriter::rewrite(OrigSCEV, *SE, RewriteMap, true); + DEBUG(dbgs() << "LV: Replacing SCEV: " << *OrigSCEV << " by: " << *ByOne + << "\n"); + return ByOne; + } + + // Otherwise, just return the SCEV of the original pointer. + return SE->getSCEV(Ptr); +} + +void LoopVectorizationLegality::RuntimePointerCheck::insert( + ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId, + unsigned ASId, ValueToValueMap &Strides) { + // Get the stride replaced scev. + const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr); const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); assert(AR && "Invalid addrec expression"); const SCEV *Ex = SE->getBackedgeTakenCount(Lp); @@ -1025,12 +1470,15 @@ LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, Ends.push_back(ScEnd); IsWritePtr.push_back(WritePtr); DependencySetId.push_back(DepSetId); + AliasSetId.push_back(ASId); } Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { // We need to place the broadcast of invariant variables outside the loop. 
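
replaceSymbolicStrideSCEV is one half of memory-access versioning: analysis proceeds as if the symbolic stride were 1, and the stride check emitted later (addStrideCheck, further down) guards that assumption at run time. At the source level the net transformation is roughly this hand-written equivalent (illustrative names, not from the patch):

// Original: the stride is a runtime value, so accesses may not be consecutive.
void saxpyStrided(float *A, const float *B, int N, int Stride) {
  for (int i = 0; i < N; ++i)
    A[i * Stride] += B[i * Stride];
}

// What versioning buys: assume Stride == 1 during analysis, then guard it.
// The guarded body has unit-stride accesses and can use wide loads/stores;
// the other path keeps the original scalar loop.
void saxpyVersioned(float *A, const float *B, int N, int Stride) {
  if (Stride == 1) {
    for (int i = 0; i < N; ++i) // vectorizable: consecutive accesses
      A[i] += B[i];
  } else {
    for (int i = 0; i < N; ++i) // scalar fallback
      A[i * Stride] += B[i * Stride];
  }
}
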
Instruction *Instr = dyn_cast<Instruction>(V); - bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody); + bool NewInstr = + (Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(), + Instr->getParent()) != LoopVectorBody.end()); bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr; // Place the code for broadcasting invariant variables in the new preheader. @@ -1070,7 +1518,7 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx, /// \brief Find the operand of the GEP that should be checked for consecutive /// stores. This ignores trailing indices that have no effect on the final /// pointer. -static unsigned getGEPInductionOperand(DataLayout *DL, +static unsigned getGEPInductionOperand(const DataLayout *DL, const GetElementPtrInst *Gep) { unsigned LastOperand = Gep->getNumOperands() - 1; unsigned GEPAllocSize = DL->getTypeAllocSize( @@ -1093,7 +1541,7 @@ static unsigned getGEPInductionOperand(DataLayout *DL, } int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { - assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr"); + assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr"); // Make sure that the pointer does not point to structs. if (Ptr->getType()->getPointerElementType()->isAggregateType()) return 0; @@ -1147,7 +1595,27 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // We can emit wide load/stores only if the last non-zero index is the // induction variable. - const SCEV *Last = SE->getSCEV(Gep->getOperand(InductionOperand)); + const SCEV *Last = nullptr; + if (!Strides.count(Gep)) + Last = SE->getSCEV(Gep->getOperand(InductionOperand)); + else { + // Because of the multiplication by a stride we can have a s/zext cast. + // We are going to replace this stride by 1 so the cast is safe to ignore. + // + // %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + // %0 = trunc i64 %indvars.iv to i32 + // %mul = mul i32 %0, %Stride1 + // %idxprom = zext i32 %mul to i64 << Safe cast. + // %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom + // + Last = replaceSymbolicStrideSCEV(SE, Strides, + Gep->getOperand(InductionOperand), Gep); + if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last)) + Last = + (C->getSCEVType() == scSignExtend || C->getSCEVType() == scZeroExtend) + ? C->getOperand() + : Last; + } if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { const SCEV *Step = AR->getStepRecurrence(*SE); @@ -1171,6 +1639,10 @@ InnerLoopVectorizer::getVectorValue(Value *V) { assert(V != Induction && "The new induction variable should not be used."); assert(!V->getType()->isVectorTy() && "Can't widen a vector"); + // If we have a stride that is replaced by one, do it here. + if (Legal->hasStride(V)) + V = ConstantInt::get(V->getType(), 1); + // If we have this scalar in the map, return it. if (WidenMap.has(V)) return WidenMap.get(V); @@ -1192,9 +1664,7 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) { "reverse"); } - -void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, - LoopVectorizationLegality *Legal) { +void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // Attempt to issue a wide load. 
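
Callers of isConsecutivePtr depend on its small return convention. A toy restatement over byte strides (in the pass the step comes from the SCEV of the GEP's induction operand rather than from a parameter):

#include <cstdint>

// +1: unit forward stride, a wide load/store is legal.
// -1: unit reverse stride, a wide access plus a vector reverse.
//  0: anything else, the access is scalarized.
static int classifyStride(int64_t StepBytes, int64_t ElemBytes) {
  if (StepBytes == ElemBytes)
    return 1;
  if (StepBytes == -ElemBytes)
    return -1;
  return 0;
}
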
LoadInst *LI = dyn_cast<LoadInst>(Instr); StoreInst *SI = dyn_cast<StoreInst>(Instr); @@ -1213,10 +1683,13 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy); unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF; + if (SI && Legal->blockNeedsPredication(SI->getParent())) + return scalarizeInstruction(Instr, true); + if (ScalarAllocatedSize != VectorElementSize) return scalarizeInstruction(Instr); - // If the pointer is loop invariant or if it is non consecutive, + // If the pointer is loop invariant or if it is non-consecutive, // scalarize the load. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; @@ -1304,7 +1777,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); - Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment); + StoreInst *NewSI = + Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment); + propagateMetadata(NewSI, SI); } return; } @@ -1325,13 +1800,13 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); - Value *LI = Builder.CreateLoad(VecPtr, "wide.load"); - cast<LoadInst>(LI)->setAlignment(Alignment); - Entry[Part] = Reverse ? reverseVector(LI) : LI; + LoadInst *NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load"); + propagateMetadata(NewLI, LI); + Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI; } } -void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { +void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. SmallVector<VectorParts, 4> Params; @@ -1371,15 +1846,43 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); - Value *UndefVec = IsVoidRetTy ? 0 : + Value *UndefVec = IsVoidRetTy ? nullptr : UndefValue::get(VectorType::get(Instr->getType(), VF)); // Create a new entry in the WidenMap and initialize it to Undef or Null. VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); + Instruction *InsertPt = Builder.GetInsertPoint(); + BasicBlock *IfBlock = Builder.GetInsertBlock(); + BasicBlock *CondBlock = nullptr; + + VectorParts Cond; + Loop *VectorLp = nullptr; + if (IfPredicateStore) { + assert(Instr->getParent()->getSinglePredecessor() && + "Only support single predecessor blocks"); + Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), + Instr->getParent()); + VectorLp = LI->getLoopFor(IfBlock); + assert(VectorLp && "Must have a loop for this block"); + } + // For each vector unroll 'part': for (unsigned Part = 0; Part < UF; ++Part) { // For each scalar that we create: for (unsigned Width = 0; Width < VF; ++Width) { + + // Start if-block. + Value *Cmp = nullptr; + if (IfPredicateStore) { + Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width)); + Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); + CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); + LoopVectorBody.push_back(CondBlock); + VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase()); + // Update Builder with newly created basic block. 
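
The IfPredicateStore machinery above recreates, lane by lane, the conditional store that the original loop guarded with a branch: each cloned scalar store is wrapped in its own cond.store/else block pair keyed on the lane's mask bit. The scalar-level effect, with illustrative names:

// Per-lane expansion of a predicated store: the store must not execute for
// lanes whose predicate is false, so it cannot be issued as one wide store.
void predicatedStore(float *A, const float *Val, const bool *Mask,
                     unsigned VF) {
  for (unsigned Lane = 0; Lane != VF; ++Lane) {
    // "extractelement" of the mask bit, then the conditional block split:
    if (Mask[Lane])         // branch into the "cond.store" block
      A[Lane] = Val[Lane];  // the cloned scalar store
    // control rejoins in the "else" block created by the split
  }
}
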
+          Builder.SetInsertPoint(InsertPt);
+        }
+
         Instruction *Cloned = Instr->clone();
         if (!IsVoidRetTy)
           Cloned->setName(Instr->getName() + ".cloned");
@@ -1400,18 +1903,75 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
       if (!IsVoidRetTy)
         VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
                                                        Builder.getInt32(Width));
+      // End if-block.
+      if (IfPredicateStore) {
+        BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+        LoopVectorBody.push_back(NewIfBlock);
+        VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase());
+        Builder.SetInsertPoint(InsertPt);
+        Instruction *OldBr = IfBlock->getTerminator();
+        BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+        OldBr->eraseFromParent();
+        IfBlock = NewIfBlock;
+      }
     }
   }
 }

-Instruction *
-InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
-                                     Instruction *Loc) {
+static Instruction *getFirstInst(Instruction *FirstInst, Value *V,
+                                 Instruction *Loc) {
+  if (FirstInst)
+    return FirstInst;
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    return I->getParent() == Loc->getParent() ? I : nullptr;
+  return nullptr;
+}
+
+std::pair<Instruction *, Instruction *>
+InnerLoopVectorizer::addStrideCheck(Instruction *Loc) {
+  Instruction *tnullptr = nullptr;
+  if (!Legal->mustCheckStrides())
+    return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);
+
+  IRBuilder<> ChkBuilder(Loc);
+
+  // Emit checks.
+  Value *Check = nullptr;
+  Instruction *FirstInst = nullptr;
+  for (SmallPtrSet<Value *, 8>::iterator SI = Legal->strides_begin(),
+                                         SE = Legal->strides_end();
+       SI != SE; ++SI) {
+    Value *Ptr = stripIntegerCast(*SI);
+    Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1),
+                                       "stride.chk");
+    // Store the first instruction we create.
+    FirstInst = getFirstInst(FirstInst, C, Loc);
+    if (Check)
+      Check = ChkBuilder.CreateOr(Check, C);
+    else
+      Check = C;
+  }
+
+  // We have to do this trickery because the IRBuilder might fold the check to a
+  // constant expression in which case there is no Instruction anchored in
+  // the block.
+  LLVMContext &Ctx = Loc->getContext();
+  Instruction *TheCheck =
+      BinaryOperator::CreateAnd(Check, ConstantInt::getTrue(Ctx));
+  ChkBuilder.Insert(TheCheck, "stride.not.one");
+  FirstInst = getFirstInst(FirstInst, TheCheck, Loc);
+
+  return std::make_pair(FirstInst, TheCheck);
+}
+
+std::pair<Instruction *, Instruction *>
+InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) {
   LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
   Legal->getRuntimePointerCheck();

+  Instruction *tnullptr = nullptr;
   if (!PtrRtCheck->Need)
-    return NULL;
+    return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);

   unsigned NumPointers = PtrRtCheck->Pointers.size();
   SmallVector<TrackingVH<Value> , 2> Starts;
@@ -1419,6 +1979,7 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,

   LLVMContext &Ctx = Loc->getContext();
   SCEVExpander Exp(*SE, "induction");
+  Instruction *FirstInst = nullptr;

   for (unsigned i = 0; i < NumPointers; ++i) {
     Value *Ptr = PtrRtCheck->Pointers[i];
@@ -1445,7 +2006,7 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
   IRBuilder<> ChkBuilder(Loc);
   // Our instructions might fold to a constant.
-  Value *MemoryRuntimeCheck = 0;
+  Value *MemoryRuntimeCheck = nullptr;
   for (unsigned i = 0; i < NumPointers; ++i) {
     for (unsigned j = i+1; j < NumPointers; ++j) {
       // No need to check if two readonly pointers intersect.
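
addStrideCheck ORs the individual "stride != 1" comparisons into a single flag so one conditional branch can route execution to the scalar loop. The boolean structure in isolation:

#include <cstdint>
#include <vector>

// True when any stride we speculated to be 1 is not, i.e. the vector path
// built under that assumption must be bypassed.
static bool strideNotOne(const std::vector<int64_t> &Strides) {
  bool Check = false;
  for (int64_t S : Strides)
    Check |= (S != 1); // mirrors the chained CreateOr of "stride.chk" values
  return Check;
}
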
@@ -1455,6 +2016,9 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, // Only need to check pointers between two different dependency sets. if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j]) continue; + // Only need to check pointers in the same alias set. + if (PtrRtCheck->AliasSetId[i] != PtrRtCheck->AliasSetId[j]) + continue; unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace(); unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace(); @@ -1472,11 +2036,16 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy0, "bc"); Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); + FirstInst = getFirstInst(FirstInst, Cmp0, Loc); Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); + FirstInst = getFirstInst(FirstInst, Cmp1, Loc); Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict"); - if (MemoryRuntimeCheck) + FirstInst = getFirstInst(FirstInst, IsConflict, Loc); + if (MemoryRuntimeCheck) { IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx"); + FirstInst = getFirstInst(FirstInst, IsConflict, Loc); + } MemoryRuntimeCheck = IsConflict; } } @@ -1487,30 +2056,33 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck, ConstantInt::getTrue(Ctx)); ChkBuilder.Insert(Check, "memcheck.conflict"); - return Check; + FirstInst = getFirstInst(FirstInst, Check, Loc); + return std::make_pair(FirstInst, Check); } -void -InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { +void InnerLoopVectorizer::createEmptyLoop() { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the scalar remainder. - [ ] <-- vector loop bypass (may consist of multiple blocks). - / | - / v - | [ ] <-- vector pre header. - | | - | v - | [ ] \ - | [ ]_| <-- vector loop. - | | - \ v - >[ ] <--- middle-block. - / | - / v - | [ ] <--- new preheader. + [ ] <-- Back-edge taken count overflow check. + / | + / v + | [ ] <-- vector loop bypass (may consist of multiple blocks). + | / | + | / v + || [ ] <-- vector pre header. + || | + || v + || [ ] \ + || [ ]_| <-- vector loop. + || | + | \ v + | >[ ] <--- middle-block. + | / | + | / v + -|- >[ ] <--- new preheader. | | | v | [ ] \ @@ -1524,6 +2096,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { BasicBlock *OldBasicBlock = OrigLoop->getHeader(); BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); BasicBlock *ExitBlock = OrigLoop->getExitBlock(); + assert(BypassBlock && "Invalid loop structure"); assert(ExitBlock && "Must have an exit block"); // Some loops have a single integer induction variable, while other loops @@ -1546,18 +2119,30 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { IdxTy->getPrimitiveSizeInBits()) ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy); - ExitCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy); + const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy); // Get the total trip count from the count by adding 1. - ExitCount = SE->getAddExpr(ExitCount, - SE->getConstant(ExitCount->getType(), 1)); + ExitCount = SE->getAddExpr(BackedgeTakeCount, + SE->getConstant(BackedgeTakeCount->getType(), 1)); // Expand the trip count and place the new instructions in the preheader. 
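
The pairwise memcheck loop above prunes before it compares: read/read pairs, pairs in the same dependence set (already handled by dependence analysis), and pairs in different alias sets (proven disjoint by alias analysis) are all skipped, and what remains is an interval-overlap test folded into one conflict flag. A self-contained model, with plain integer ranges standing in for the expanded SCEV start/end values:

#include <cstdint>
#include <vector>

struct PtrCheck {
  uint64_t Start, End; // range the pointer sweeps over the whole loop
  bool IsWrite;
  unsigned DepSetId;   // accesses grouped by the dependence analysis
  unsigned AliasSetId; // accesses that may alias share a set
};

// True if any pair of ranges that still needs a runtime check overlaps;
// the vector loop is only entered when this is false.
static bool anyConflict(const std::vector<PtrCheck> &P) {
  bool Conflict = false;
  for (size_t i = 0; i < P.size(); ++i)
    for (size_t j = i + 1; j < P.size(); ++j) {
      if (!P[i].IsWrite && !P[j].IsWrite) continue; // two reads never conflict
      if (P[i].DepSetId == P[j].DepSetId) continue; // same dependence set
      if (P[i].AliasSetId != P[j].AliasSetId) continue; // cannot alias
      // "bound0" && "bound1": ranges intersect iff each starts before the
      // other ends.
      Conflict |= P[i].Start <= P[j].End && P[j].Start <= P[i].End;
    }
  return Conflict;
}
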
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, "induction");

-  // Count holds the overall loop count (N).
-  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
-                                   BypassBlock->getTerminator());
+  // We need to test whether the backedge-taken count is uint##_max. Adding one
+  // to it will cause overflow and an incorrect loop trip count in the vector
+  // body. In case of overflow we want to directly jump to the scalar remainder
+  // loop.
+  Value *BackedgeCount =
+      Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(),
+                        BypassBlock->getTerminator());
+  if (BackedgeCount->getType()->isPointerTy())
+    BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy,
+                                                "backedge.ptrcnt.to.int",
+                                                BypassBlock->getTerminator());
+  Instruction *CheckBCOverflow =
+      CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount,
+                      Constant::getAllOnesValue(BackedgeCount->getType()),
+                      "backedge.overflow", BypassBlock->getTerminator());

   // The loop index does not have to start at Zero. Find the original start
   // value from the induction PHI node. If we don't have an induction variable
@@ -1568,7 +2153,18 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
                                        IdxTy):
     ConstantInt::get(IdxTy, 0);

-  assert(BypassBlock && "Invalid loop structure");
+  // We need an instruction to anchor the overflow check on. StartIdx needs to
+  // be defined before the overflow check branch. Because the scalar preheader
+  // is going to merge the start index, the overflow branch block needs to
+  // contain a definition of the start index.
+  Instruction *OverflowCheckAnchor = BinaryOperator::CreateAdd(
+      StartIdx, ConstantInt::get(IdxTy, 0), "overflow.check.anchor",
+      BypassBlock->getTerminator());
+
+  // Count holds the overall loop count (N).
+  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
+                                   BypassBlock->getTerminator());
+
   LoopBypassBlocks.push_back(BypassBlock);

   // Split the single block loop into the two loop structure described above.
@@ -1637,27 +2233,69 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {

   // Now, compare the new count to zero. If it is zero skip the vector loop and
   // jump to the scalar loop.
-  Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx,
-                                          "cmp.zero");
+  Value *Cmp =
+      BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero");

   BasicBlock *LastBypassBlock = BypassBlock;

+  // Generate code to check that the loop's trip count that we computed by
+  // adding one to the backedge-taken count will not overflow.
+  {
+    auto PastOverflowCheck =
+        std::next(BasicBlock::iterator(OverflowCheckAnchor));
+    BasicBlock *CheckBlock =
+        LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked");
+    if (ParentLoop)
+      ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
+    LoopBypassBlocks.push_back(CheckBlock);
+    Instruction *OldTerm = LastBypassBlock->getTerminator();
+    BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm);
+    OldTerm->eraseFromParent();
+    LastBypassBlock = CheckBlock;
+  }
+
+  // Generate the code to check that the strides we assumed to be one are
+  // really one. We want the new basic block to start at the first instruction
+  // in a sequence of instructions that form a check.
+  Instruction *StrideCheck;
+  Instruction *FirstCheckInst;
+  std::tie(FirstCheckInst, StrideCheck) =
+      addStrideCheck(LastBypassBlock->getTerminator());
+  if (StrideCheck) {
+    // Create a new block containing the stride check.
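
The backedge.overflow test exists because the vector trip count is computed as backedge-taken count + 1; when the backedge-taken count is already the maximum representable value, the +1 wraps to zero and the vector loop would run with a bogus count. Reduced to its essence:

#include <cstdint>
#include <limits>

// Equivalent of the all-ones compare emitted as "backedge.overflow": if the
// backedge-taken count is already uint64 max, adding one overflows, so
// control must go straight to the scalar remainder loop.
static bool tripCountOverflows(uint64_t BackedgeTakenCount) {
  return BackedgeTakenCount == std::numeric_limits<uint64_t>::max();
}
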
+    BasicBlock *CheckBlock =
+        LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");
+    if (ParentLoop)
+      ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
+    LoopBypassBlocks.push_back(CheckBlock);
+
+    // Replace the branch into the stride check block with a conditional branch
+    // for the "few elements case".
+    Instruction *OldTerm = LastBypassBlock->getTerminator();
+    BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
+    OldTerm->eraseFromParent();
+
+    Cmp = StrideCheck;
+    LastBypassBlock = CheckBlock;
+  }
+
   // Generate the code that checks in runtime if arrays overlap. We put the
   // checks into a separate block to make the more common case of few elements
   // faster.
-  Instruction *MemRuntimeCheck = addRuntimeCheck(Legal,
-                                                 BypassBlock->getTerminator());
+  Instruction *MemRuntimeCheck;
+  std::tie(FirstCheckInst, MemRuntimeCheck) =
+      addRuntimeCheck(LastBypassBlock->getTerminator());
   if (MemRuntimeCheck) {
     // Create a new block containing the memory check.
-    BasicBlock *CheckBlock = BypassBlock->splitBasicBlock(MemRuntimeCheck,
-                                                          "vector.memcheck");
+    BasicBlock *CheckBlock =
+        LastBypassBlock->splitBasicBlock(MemRuntimeCheck, "vector.memcheck");
     if (ParentLoop)
       ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
     LoopBypassBlocks.push_back(CheckBlock);

     // Replace the branch into the memory check block with a conditional branch
     // for the "few elements case".
-    Instruction *OldTerm = BypassBlock->getTerminator();
+    Instruction *OldTerm = LastBypassBlock->getTerminator();
     BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
     OldTerm->eraseFromParent();

@@ -1678,7 +2316,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   // start value.

   // This variable saves the new starting index for the scalar loop.
-  PHINode *ResumeIndex = 0;
+  PHINode *ResumeIndex = nullptr;
   LoopVectorizationLegality::InductionList::iterator I, E;
   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
   // Set builder to point to last bypass block.
@@ -1694,9 +2332,22 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
     // truncated version for the scalar loop.
     PHINode *TruncResumeVal = (OrigPhi == OldInduction) ?
       PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
-                      MiddleBlock->getTerminator()) : 0;
+                      MiddleBlock->getTerminator()) : nullptr;
+
+    // Create phi nodes to merge from the backedge-taken check block.
+    PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val",
+                                           ScalarPH->getTerminator());
+    BCResumeVal->addIncoming(ResumeVal, MiddleBlock);
+
+    PHINode *BCTruncResumeVal = nullptr;
+    if (OrigPhi == OldInduction) {
+      BCTruncResumeVal =
+          PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val",
+                          ScalarPH->getTerminator());
+      BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock);
+    }

-    Value *EndValue = 0;
+    Value *EndValue = nullptr;
     switch (II.IK) {
     case LoopVectorizationLegality::IK_NoInduction:
       llvm_unreachable("Unknown induction");
@@ -1712,10 +2363,12 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
         BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());
       // The new PHI merges the original incoming value, in case of a bypass,
       // or the value at the end of the vectorized loop.
- for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); TruncResumeVal->addIncoming(EndValue, VecBody); + BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); + // We know what the end value is. EndValue = IdxEndRoundDown; // We also know which PHI node holds it. @@ -1761,7 +2414,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) { + for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) { if (OrigPhi == OldInduction) ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]); else @@ -1771,11 +2424,16 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // Fix the scalar body counter (PHI node). unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); - // The old inductions phi node in the scalar body needs the truncated value. - if (OrigPhi == OldInduction) - OrigPhi->setIncomingValue(BlockIdx, TruncResumeVal); - else - OrigPhi->setIncomingValue(BlockIdx, ResumeVal); + + // The old induction's phi node in the scalar body needs the truncated + // value. + if (OrigPhi == OldInduction) { + BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]); + OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal); + } else { + BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); + OrigPhi->setIncomingValue(BlockIdx, BCResumeVal); + } } // If we are generating a new induction variable then we also need to @@ -1786,7 +2444,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { assert(!ResumeIndex && "Unexpected resume value found"); ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", MiddleBlock->getTerminator()); - for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]); ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); } @@ -1825,7 +2483,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { LoopScalarPreHeader = ScalarPH; LoopMiddleBlock = MiddleBlock; LoopExitBlock = ExitBlock; - LoopVectorBody = VecBody; + LoopVectorBody.push_back(VecBody); LoopScalarBody = OldBasicBlock; LoopVectorizeHints Hints(Lp, true); @@ -1859,148 +2517,6 @@ LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) { } } -static Intrinsic::ID checkUnaryFloatSignature(const CallInst &I, - Intrinsic::ID ValidIntrinsicID) { - if (I.getNumArgOperands() != 1 || - !I.getArgOperand(0)->getType()->isFloatingPointTy() || - I.getType() != I.getArgOperand(0)->getType() || - !I.onlyReadsMemory()) - return Intrinsic::not_intrinsic; - - return ValidIntrinsicID; -} - -static Intrinsic::ID checkBinaryFloatSignature(const CallInst &I, - Intrinsic::ID ValidIntrinsicID) { - if (I.getNumArgOperands() != 2 || - !I.getArgOperand(0)->getType()->isFloatingPointTy() || - !I.getArgOperand(1)->getType()->isFloatingPointTy() || - I.getType() != I.getArgOperand(0)->getType() || - I.getType() != I.getArgOperand(1)->getType() || - !I.onlyReadsMemory()) - return Intrinsic::not_intrinsic; - - return ValidIntrinsicID; -} - - -static Intrinsic::ID -getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) { - // If we have an intrinsic call, check if it is 
trivially vectorizable. - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) { - switch (II->getIntrinsicID()) { - case Intrinsic::sqrt: - case Intrinsic::sin: - case Intrinsic::cos: - case Intrinsic::exp: - case Intrinsic::exp2: - case Intrinsic::log: - case Intrinsic::log10: - case Intrinsic::log2: - case Intrinsic::fabs: - case Intrinsic::copysign: - case Intrinsic::floor: - case Intrinsic::ceil: - case Intrinsic::trunc: - case Intrinsic::rint: - case Intrinsic::nearbyint: - case Intrinsic::round: - case Intrinsic::pow: - case Intrinsic::fma: - case Intrinsic::fmuladd: - case Intrinsic::lifetime_start: - case Intrinsic::lifetime_end: - return II->getIntrinsicID(); - default: - return Intrinsic::not_intrinsic; - } - } - - if (!TLI) - return Intrinsic::not_intrinsic; - - LibFunc::Func Func; - Function *F = CI->getCalledFunction(); - // We're going to make assumptions on the semantics of the functions, check - // that the target knows that it's available in this environment and it does - // not have local linkage. - if (!F || F->hasLocalLinkage() || !TLI->getLibFunc(F->getName(), Func)) - return Intrinsic::not_intrinsic; - - // Otherwise check if we have a call to a function that can be turned into a - // vector intrinsic. - switch (Func) { - default: - break; - case LibFunc::sin: - case LibFunc::sinf: - case LibFunc::sinl: - return checkUnaryFloatSignature(*CI, Intrinsic::sin); - case LibFunc::cos: - case LibFunc::cosf: - case LibFunc::cosl: - return checkUnaryFloatSignature(*CI, Intrinsic::cos); - case LibFunc::exp: - case LibFunc::expf: - case LibFunc::expl: - return checkUnaryFloatSignature(*CI, Intrinsic::exp); - case LibFunc::exp2: - case LibFunc::exp2f: - case LibFunc::exp2l: - return checkUnaryFloatSignature(*CI, Intrinsic::exp2); - case LibFunc::log: - case LibFunc::logf: - case LibFunc::logl: - return checkUnaryFloatSignature(*CI, Intrinsic::log); - case LibFunc::log10: - case LibFunc::log10f: - case LibFunc::log10l: - return checkUnaryFloatSignature(*CI, Intrinsic::log10); - case LibFunc::log2: - case LibFunc::log2f: - case LibFunc::log2l: - return checkUnaryFloatSignature(*CI, Intrinsic::log2); - case LibFunc::fabs: - case LibFunc::fabsf: - case LibFunc::fabsl: - return checkUnaryFloatSignature(*CI, Intrinsic::fabs); - case LibFunc::copysign: - case LibFunc::copysignf: - case LibFunc::copysignl: - return checkBinaryFloatSignature(*CI, Intrinsic::copysign); - case LibFunc::floor: - case LibFunc::floorf: - case LibFunc::floorl: - return checkUnaryFloatSignature(*CI, Intrinsic::floor); - case LibFunc::ceil: - case LibFunc::ceilf: - case LibFunc::ceill: - return checkUnaryFloatSignature(*CI, Intrinsic::ceil); - case LibFunc::trunc: - case LibFunc::truncf: - case LibFunc::truncl: - return checkUnaryFloatSignature(*CI, Intrinsic::trunc); - case LibFunc::rint: - case LibFunc::rintf: - case LibFunc::rintl: - return checkUnaryFloatSignature(*CI, Intrinsic::rint); - case LibFunc::nearbyint: - case LibFunc::nearbyintf: - case LibFunc::nearbyintl: - return checkUnaryFloatSignature(*CI, Intrinsic::nearbyint); - case LibFunc::round: - case LibFunc::roundf: - case LibFunc::roundl: - return checkUnaryFloatSignature(*CI, Intrinsic::round); - case LibFunc::pow: - case LibFunc::powf: - case LibFunc::powl: - return checkBinaryFloatSignature(*CI, Intrinsic::pow); - } - - return Intrinsic::not_intrinsic; -} - /// This function translates the reduction kind to an LLVM binary operator. 
static unsigned getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { @@ -2093,30 +2609,56 @@ struct CSEDenseMapInfo { }; } +/// \brief Check whether this block is a predicated block. +/// Due to if predication of stores we might create a sequence of "if(pred) a[i] +/// = ...; " blocks. We start with one vectorized basic block. For every +/// conditional block we split this vectorized block. Therefore, every second +/// block will be a predicated one. +static bool isPredicatedBlock(unsigned BlockNum) { + return BlockNum % 2; +} + ///\brief Perform cse of induction variable instructions. -static void cse(BasicBlock *BB) { +static void cse(SmallVector<BasicBlock *, 4> &BBs) { // Perform simple cse. SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { - Instruction *In = I++; + for (unsigned i = 0, e = BBs.size(); i != e; ++i) { + BasicBlock *BB = BBs[i]; + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { + Instruction *In = I++; - if (!CSEDenseMapInfo::canHandle(In)) - continue; + if (!CSEDenseMapInfo::canHandle(In)) + continue; - // Check if we can replace this instruction with any of the - // visited instructions. - if (Instruction *V = CSEMap.lookup(In)) { - In->replaceAllUsesWith(V); - In->eraseFromParent(); - continue; + // Check if we can replace this instruction with any of the + // visited instructions. + if (Instruction *V = CSEMap.lookup(In)) { + In->replaceAllUsesWith(V); + In->eraseFromParent(); + continue; + } + // Ignore instructions in conditional blocks. We create "if (pred) a[i] = + // ...;" blocks for predicated stores. Every second block is a predicated + // block. + if (isPredicatedBlock(i)) + continue; + + CSEMap[In] = In; } + } +} - CSEMap[In] = In; +/// \brief Adds a 'fast' flag to floating point operations. +static Value *addFastMathFlag(Value *V) { + if (isa<FPMathOperator>(V)){ + FastMathFlags Flags; + Flags.setUnsafeAlgebra(); + cast<Instruction>(V)->setFastMathFlags(Flags); } + return V; } -void -InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { +void InnerLoopVectorizer::vectorizeLoop() { //===------------------------------------------------===// // // Notice: any optimization or new instruction that go @@ -2144,7 +2686,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Vectorize all of the blocks in the original loop. for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), be = DFS.endRPO(); bb != be; ++bb) - vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix); + vectorizeBlockInLoop(*bb, &RdxPHIsToFix); // At this point every instruction in the original loop is widened to // a vector form. We are almost done. Now, we need to fix the PHI nodes @@ -2169,10 +2711,10 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { setDebugLocFromInst(Builder, RdxDesc.StartValue); // We need to generate a reduction vector from the incoming scalar. - // To do so, we need to generate the 'identity' vector and overide + // To do so, we need to generate the 'identity' vector and override // one of the elements with the incoming scalar reduction. We need // to do it in the vector-loop preheader. - Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator()); + Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator()); // This is the vector-clone of the value that leaves the loop. 
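
isPredicatedBlock encodes a structural invariant rather than a CFG query: the vector body starts as a single block and is split once per predicated store, so predicated blocks land at the odd indices, and cse must not treat values defined there as available everywhere. A toy model of that asymmetry, using an instruction's text as its CSE key:

#include <set>
#include <string>
#include <vector>

static bool predicated(unsigned BlockNum) { return BlockNum % 2; }

// Expressions seen earlier in unconditional blocks can replace later
// duplicates, but a definition inside a predicated block is never recorded,
// because it does not execute on every vector iteration.
static unsigned countReusable(const std::vector<std::vector<std::string>> &Blocks) {
  std::set<std::string> Available;
  unsigned Reused = 0;
  for (unsigned i = 0; i != Blocks.size(); ++i)
    for (const std::string &Expr : Blocks[i]) {
      if (Available.count(Expr)) {
        ++Reused; // would be replaced by the earlier definition
        continue;
      }
      if (!predicated(i))
        Available.insert(Expr);
    }
  return Reused;
}
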
VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr); @@ -2228,7 +2770,8 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // first unroll part. Value *StartVal = (part == 0) ? VectorStart : Identity; cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader); - cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody); + cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], + LoopVectorBody.back()); } // Before each round, move the insertion point right between @@ -2245,9 +2788,10 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr); PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); Value *StartVal = (part == 0) ? VectorStart : Identity; - for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]); - NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody); + NewPhi->addIncoming(RdxExitVal[part], + LoopVectorBody.back()); RdxParts.push_back(NewPhi); } @@ -2257,9 +2801,10 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { setDebugLocFromInst(Builder, ReducedPartRdx); for (unsigned part = 1; part < UF; ++part) { if (Op != Instruction::ICmp && Op != Instruction::FCmp) - ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op, - RdxParts[part], ReducedPartRdx, - "bin.rdx"); + // Floating point operations had to be 'fast' to enable the reduction. + ReducedPartRdx = addFastMathFlag( + Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part], + ReducedPartRdx, "bin.rdx")); else ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind, ReducedPartRdx, RdxParts[part]); @@ -2272,7 +2817,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { assert(isPowerOf2_32(VF) && "Reduction emission only supported for pow2 vectors!"); Value *TmpVec = ReducedPartRdx; - SmallVector<Constant*, 32> ShuffleMask(VF, 0); + SmallVector<Constant*, 32> ShuffleMask(VF, nullptr); for (unsigned i = VF; i != 1; i >>= 1) { // Move the upper half of the vector to the lower half. for (unsigned j = 0; j != i/2; ++j) @@ -2289,8 +2834,9 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { "rdx.shuf"); if (Op != Instruction::ICmp && Op != Instruction::FCmp) - TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf, - "bin.rdx"); + // Floating point operations had to be 'fast' to enable the reduction. + TmpVec = addFastMathFlag(Builder.CreateBinOp( + (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx")); else TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf); } @@ -2300,6 +2846,13 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { Builder.getInt32(0)); } + // Create a phi node that merges control-flow from the backedge-taken check + // block and the middle block. + PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx", + LoopScalarPreHeader->getTerminator()); + BCBlockPhi->addIncoming(RdxDesc.StartValue, LoopBypassBlocks[0]); + BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); + // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. // We know that the loop is in LCSSA form. 
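
The shuffle sequence above builds a log2-depth tree reduction: each round folds the upper half of the vector onto the lower half until lane 0 holds the result. For floating point this reorders the operations, which is exactly why the surrounding code stamps the binops with 'fast' flags. A scalar model:

// Scalar model of the log2(VF) shuffle reduction: fold the upper half onto
// the lower half each round ("rdx.shuf" + "bin.rdx"), then read lane 0.
// This reassociates the operation, hence the 'fast' flags for FP reductions.
template <typename T, typename BinOp>
T treeReduce(T *Lanes, unsigned VF /* power of two */, BinOp Op) {
  for (unsigned i = VF; i != 1; i >>= 1)
    for (unsigned j = 0; j != i / 2; ++j)
      Lanes[j] = Op(Lanes[j], Lanes[j + i / 2]);
  return Lanes[0]; // the final extractelement of lane 0
}
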
We need to update the @@ -2329,7 +2882,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); // Pick the other block. int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); - (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, ReducedPartRdx); + (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); }// end of for each redux variable. @@ -2411,7 +2964,6 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, - LoopVectorizationLegality *Legal, unsigned UF, unsigned VF, PhiVector *PV) { PHINode* P = cast<PHINode>(PN); // Handle reduction variables: @@ -2421,7 +2973,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, Type *VecTy = (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); Entry[part] = PHINode::Create(VecTy, 2, "vec.phi", - LoopVectorBody-> getFirstInsertionPt()); + LoopVectorBody.back()-> getFirstInsertionPt()); } PV->push_back(P); return; @@ -2430,7 +2982,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, setDebugLocFromInst(Builder, P); // Check for PHI nodes that are lowered to vector selects. if (P->getParent() != OrigLoop->getHeader()) { - // We know that all PHIs in non header blocks are converted into + // We know that all PHIs in non-header blocks are converted into // selects, so we don't have to worry about the insertion order and we // can just use the builder. // At this point we generate the predication tree. There may be @@ -2573,9 +3125,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, } } -void -InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, - BasicBlock *BB, PhiVector *PV) { +void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // For each instruction in the old loop. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { VectorParts &Entry = WidenMap.get(it); @@ -2586,7 +3136,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, continue; case Instruction::PHI:{ // Vectorize PHINodes. - widenPHIInstruction(it, Entry, Legal, UF, VF, PV); + widenPHIInstruction(it, Entry, UF, VF, PV); continue; }// End of PHI. @@ -2627,8 +3177,14 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, if (VecOp && isa<PossiblyExactOperator>(VecOp)) VecOp->setIsExact(BinOp->isExact()); + // Copy the fast-math flags. 
+ if (VecOp && isa<FPMathOperator>(V)) + VecOp->setFastMathFlags(it->getFastMathFlags()); + Entry[Part] = V; } + + propagateMetadata(Entry, it); break; } case Instruction::Select: { @@ -2656,6 +3212,8 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, Op0[Part], Op1[Part]); } + + propagateMetadata(Entry, it); break; } @@ -2668,19 +3226,21 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, VectorParts &A = getVectorValue(it->getOperand(0)); VectorParts &B = getVectorValue(it->getOperand(1)); for (unsigned Part = 0; Part < UF; ++Part) { - Value *C = 0; + Value *C = nullptr; if (FCmp) C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); else C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); Entry[Part] = C; } + + propagateMetadata(Entry, it); break; } case Instruction::Store: case Instruction::Load: - vectorizeMemoryInstruction(it, Legal); + vectorizeMemoryInstruction(it); break; case Instruction::ZExt: case Instruction::SExt: @@ -2707,6 +3267,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, Value *Broadcasted = getBroadcastInstrs(ScalarCast); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false); + propagateMetadata(Entry, it); break; } /// Vectorize casts. @@ -2716,6 +3277,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, VectorParts &A = getVectorValue(it->getOperand(0)); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); + propagateMetadata(Entry, it); break; } @@ -2735,9 +3297,14 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, scalarizeInstruction(it); break; default: + bool HasScalarOpd = hasVectorInstrinsicScalarOpd(ID, 1); for (unsigned Part = 0; Part < UF; ++Part) { SmallVector<Value *, 4> Args; for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { + if (HasScalarOpd && i == 1) { + Args.push_back(CI->getArgOperand(i)); + continue; + } VectorParts &Arg = getVectorValue(CI->getArgOperand(i)); Args.push_back(Arg[Part]); } @@ -2748,6 +3315,8 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, Function *F = Intrinsic::getDeclaration(M, ID, Tys); Entry[Part] = Builder.CreateCall(F, Args); } + + propagateMetadata(Entry, it); break; } break; @@ -2772,13 +3341,25 @@ void InnerLoopVectorizer::updateAnalysis() { for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]); DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back()); - DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader); - DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front()); - DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock); + + // Due to if predication of stores we might create a sequence of "if(pred) + // a[i] = ...; " blocks. 
+ for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) { + if (i == 0) + DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader); + else if (isPredicatedBlock(i)) { + DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]); + } else { + DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]); + } + } + + DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]); + DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); - DEBUG(DT->verifyAnalysis()); + DEBUG(DT->verifyDomTree()); } /// \brief Check whether it is safe to if-convert this phi node. @@ -2799,8 +3380,10 @@ static bool canIfConvertPHINodes(BasicBlock *BB) { } bool LoopVectorizationLegality::canVectorizeWithIfConvert() { - if (!EnableIfConversion) + if (!EnableIfConversion) { + emitAnalysis(Report() << "if-conversion is disabled"); return false; + } assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); @@ -2830,16 +3413,24 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { BasicBlock *BB = *BI; // We don't support switch statements inside loops. - if (!isa<BranchInst>(BB->getTerminator())) + if (!isa<BranchInst>(BB->getTerminator())) { + emitAnalysis(Report(BB->getTerminator()) + << "loop contains a switch statement"); return false; + } // We must be able to predicate all blocks that need to be predicated. if (blockNeedsPredication(BB)) { - if (!blockCanBePredicated(BB, SafePointes)) + if (!blockCanBePredicated(BB, SafePointes)) { + emitAnalysis(Report(BB->getTerminator()) + << "control flow cannot be substituted for a select"); return false; - } else if (BB != Header && !canIfConvertPHINodes(BB)) + } + } else if (BB != Header && !canIfConvertPHINodes(BB)) { + emitAnalysis(Report(BB->getTerminator()) + << "control flow cannot be substituted for a select"); return false; - + } } // We can if-convert this loop. @@ -2849,26 +3440,37 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { bool LoopVectorizationLegality::canVectorize() { // We must have a loop in canonical form. Loops with indirectbr in them cannot // be canonicalized. - if (!TheLoop->getLoopPreheader()) + if (!TheLoop->getLoopPreheader()) { + emitAnalysis( + Report() << "loop control flow is not understood by vectorizer"); return false; + } // We can only vectorize innermost loops. - if (TheLoop->getSubLoopsVector().size()) + if (TheLoop->getSubLoopsVector().size()) { + emitAnalysis(Report() << "loop is not the innermost loop"); return false; + } // We must have a single backedge. - if (TheLoop->getNumBackEdges() != 1) + if (TheLoop->getNumBackEdges() != 1) { + emitAnalysis( + Report() << "loop control flow is not understood by vectorizer"); return false; + } // We must have a single exiting block. - if (!TheLoop->getExitingBlock()) + if (!TheLoop->getExitingBlock()) { + emitAnalysis( + Report() << "loop control flow is not understood by vectorizer"); return false; + } // We need to have a loop header. DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName() << '\n'); - // Check if we can if-convert non single-bb loops. + // Check if we can if-convert non-single-bb loops. unsigned NumBlocks = TheLoop->getNumBlocks(); if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); @@ -2878,19 +3480,11 @@ bool LoopVectorizationLegality::canVectorize() { // ScalarEvolution needs to be able to find the exit count. 
const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); if (ExitCount == SE->getCouldNotCompute()) { + emitAnalysis(Report() << "could not determine number of loop iterations"); DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); return false; } - // Do not loop-vectorize loops with a tiny trip count. - BasicBlock *Latch = TheLoop->getLoopLatch(); - unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch); - if (TC > 0u && TC < TinyTripCountVectorThreshold) { - DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << - "This loop is not worth vectorizing.\n"); - return false; - } - // Check if we can vectorize the instructions and CFG in this loop. if (!canVectorizeInstrs()) { DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); @@ -2916,7 +3510,7 @@ bool LoopVectorizationLegality::canVectorize() { return true; } -static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) { +static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { if (Ty->isPointerTy()) return DL.getIntPtrType(Ty); @@ -2928,7 +3522,7 @@ static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) { return Ty; } -static Type* getWiderType(DataLayout &DL, Type *Ty0, Type *Ty1) { +static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) { Ty0 = convertPointerToIntegerType(DL, Ty0); Ty1 = convertPointerToIntegerType(DL, Ty1); if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits()) @@ -2944,12 +3538,11 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, // instructions must not have external users. if (!Reductions.count(Inst)) //Check that all of the users of the loop are inside the BB. - for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end(); - I != E; ++I) { - Instruction *U = cast<Instruction>(*I); + for (User *U : Inst->users()) { + Instruction *UI = cast<Instruction>(U); // This user may be a reduction exit value. - if (!TheLoop->contains(U)) { - DEBUG(dbgs() << "LV: Found an outside user for : " << *U << '\n'); + if (!TheLoop->contains(UI)) { + DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n'); return true; } } @@ -2981,6 +3574,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && !PhiTy->isPointerTy()) { + emitAnalysis(Report(it) + << "loop control flow is not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); return false; } @@ -2991,13 +3586,17 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (*bb != Header) { // Check that this instruction has no outside users or is an // identified reduction value with an outside user. - if(!hasOutsideLoopUser(TheLoop, it, AllowedExit)) + if (!hasOutsideLoopUser(TheLoop, it, AllowedExit)) continue; + emitAnalysis(Report(it) << "value that could not be identified as " + "reduction is used outside the loop"); return false; } // We only allow if-converted PHIs with more than two incoming values. if (Phi->getNumIncomingValues() != 2) { + emitAnalysis(Report(it) + << "control flow not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); return false; } @@ -3028,8 +3627,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Until we explicitly handle the case of an induction variable with // an outside loop user we have to give up vectorizing this loop. 
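
hasOutsideLoopUser, defined above, is the LCSSA-facing legality rule: a value computed in the loop that is used outside it blocks vectorization unless it was already identified as a reduction exit value. A toy restatement:

#include <set>

struct Inst { std::set<Inst *> Users; }; // toy def-use node, not LLVM's

// True if some user of I lives outside the set of loop instructions; the
// reduction-exit exemption is handled by the caller, as in the pass.
static bool hasUserOutside(const std::set<Inst *> &LoopInsts, const Inst *I) {
  for (Inst *U : I->Users)
    if (!LoopInsts.count(U))
      return true;
  return false;
}
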
-      if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
+      if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
+        emitAnalysis(Report(it) << "use of induction value outside of the "
+                                   "loop is not handled by vectorizer");
         return false;
+      }

       continue;
     }
@@ -3072,6 +3674,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           continue;
         }

+        emitAnalysis(Report(it) << "unvectorizable operation");
         DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
         return false;
       }// end of PHI handling
@@ -3080,14 +3683,29 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       // calls and we do handle certain intrinsic and libm functions.
       CallInst *CI = dyn_cast<CallInst>(it);
       if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
+        emitAnalysis(Report(it) << "call instruction cannot be vectorized");
         DEBUG(dbgs() << "LV: Found a call site.\n");
         return false;
       }

+      // Intrinsics such as powi, cttz and ctlz are legal to vectorize if the
+      // second argument is the same (i.e. loop invariant).
+      if (CI &&
+          hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) {
+        if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) {
+          emitAnalysis(Report(it)
+                       << "intrinsic instruction cannot be vectorized");
+          DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
+          return false;
+        }
+      }
+
       // Check that the instruction return type is vectorizable.
       // Also, we can't vectorize extractelement instructions.
       if ((!VectorType::isValidElementType(it->getType()) &&
            !it->getType()->isVoidTy()) ||
           isa<ExtractElementInst>(it)) {
+        emitAnalysis(Report(it)
+                     << "instruction return type cannot be vectorized");
         DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
         return false;
       }
@@ -3095,14 +3713,24 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       // Check that the stored type is vectorizable.
       if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
         Type *T = ST->getValueOperand()->getType();
-        if (!VectorType::isValidElementType(T))
+        if (!VectorType::isValidElementType(T)) {
+          emitAnalysis(Report(ST) << "store instruction cannot be vectorized");
           return false;
+        }
+        if (EnableMemAccessVersioning)
+          collectStridedAcccess(ST);
       }

+      if (EnableMemAccessVersioning)
+        if (LoadInst *LI = dyn_cast<LoadInst>(it))
+          collectStridedAcccess(LI);
+
       // Reduction instructions are allowed to have exit users.
       // All other instructions must not have external users.
-      if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
+      if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
+        emitAnalysis(Report(it) << "value cannot be used outside the loop");
         return false;
+      }

     } // next instr.

@@ -3110,13 +3738,148 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {

   if (!Induction) {
     DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
-    if (Inductions.empty())
+    if (Inductions.empty()) {
+      emitAnalysis(Report()
+                   << "loop induction variable could not be identified");
       return false;
+    }
   }

   return true;
 }

+///\brief Remove GEPs whose indices, except the last one, are loop invariant,
+/// and return the induction operand of the gep pointer.
+static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE,
+                                 const DataLayout *DL, Loop *Lp) {
+  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+  if (!GEP)
+    return Ptr;
+
+  unsigned InductionOperand = getGEPInductionOperand(DL, GEP);
+
+  // Check that all of the gep indices are uniform except for our induction
+  // operand.
+ for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i) + if (i != InductionOperand && + !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp)) + return Ptr; + return GEP->getOperand(InductionOperand); +} + +///\brief Look for a cast use of the passed value. +static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) { + Value *UniqueCast = nullptr; + for (User *U : Ptr->users()) { + CastInst *CI = dyn_cast<CastInst>(U); + if (CI && CI->getType() == Ty) { + if (!UniqueCast) + UniqueCast = CI; + else + return nullptr; + } + } + return UniqueCast; +} + +///\brief Get the stride of a pointer access in a loop. +/// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride as a +/// pointer to the Value, or null otherwise. +static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, + const DataLayout *DL, Loop *Lp) { + const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); + if (!PtrTy || PtrTy->isAggregateType()) + return nullptr; + + // Try to remove a gep instruction to make the pointer (actually index at this + // point) easier analyzable. If OrigPtr is equal to Ptr we are analzying the + // pointer, otherwise, we are analyzing the index. + Value *OrigPtr = Ptr; + + // The size of the pointer access. + int64_t PtrAccessSize = 1; + + Ptr = stripGetElementPtr(Ptr, SE, DL, Lp); + const SCEV *V = SE->getSCEV(Ptr); + + if (Ptr != OrigPtr) + // Strip off casts. + while (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) + V = C->getOperand(); + + const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V); + if (!S) + return nullptr; + + V = S->getStepRecurrence(*SE); + if (!V) + return nullptr; + + // Strip off the size of access multiplication if we are still analyzing the + // pointer. + if (OrigPtr == Ptr) { + DL->getTypeAllocSize(PtrTy->getElementType()); + if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) { + if (M->getOperand(0)->getSCEVType() != scConstant) + return nullptr; + + const APInt &APStepVal = + cast<SCEVConstant>(M->getOperand(0))->getValue()->getValue(); + + // Huge step value - give up. + if (APStepVal.getBitWidth() > 64) + return nullptr; + + int64_t StepVal = APStepVal.getSExtValue(); + if (PtrAccessSize != StepVal) + return nullptr; + V = M->getOperand(1); + } + } + + // Strip off casts. + Type *StripedOffRecurrenceCast = nullptr; + if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) { + StripedOffRecurrenceCast = C->getType(); + V = C->getOperand(); + } + + // Look for the loop invariant symbolic value. + const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V); + if (!U) + return nullptr; + + Value *Stride = U->getValue(); + if (!Lp->isLoopInvariant(Stride)) + return nullptr; + + // If we have stripped off the recurrence cast we have to make sure that we + // return the value that is used in this loop so that we can replace it later. 
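// Illustrative loop shape getStrideFromPointer targets: the pointer SCEV is
// {a,+,4*s}, i.e. the step recurrence is a loop-invariant multiple of the
// element size, and the symbolic stride returned is s.
void strided(float *a, long n, long s) {
  for (long i = 0; i < n; ++i)
    a[i * s] *= 2.0f;
}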
+ if (StripedOffRecurrenceCast) + Stride = getUniqueCastUse(Stride, Lp, StripedOffRecurrenceCast); + + return Stride; +} + +void LoopVectorizationLegality::collectStridedAcccess(Value *MemAccess) { + Value *Ptr = nullptr; + if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess)) + Ptr = LI->getPointerOperand(); + else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess)) + Ptr = SI->getPointerOperand(); + else + return; + + Value *Stride = getStrideFromPointer(Ptr, SE, DL, TheLoop); + if (!Stride) + return; + + DEBUG(dbgs() << "LV: Found a strided access that we can version"); + DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n"); + Strides[Ptr] = Stride; + StrideSet.insert(Stride); +} + void LoopVectorizationLegality::collectLoopUniforms() { // We now know that the loop is vectorizable! // Collect variables that will remain uniform after vectorization. @@ -3126,6 +3889,16 @@ void LoopVectorizationLegality::collectLoopUniforms() { // Start with the conditional branch and walk up the block. Worklist.push_back(Latch->getTerminator()->getOperand(0)); + // Also add all consecutive pointer values; these values will be uniform + // after vectorization (and subsequent cleanup) and, until revectorization is + // supported, all dependencies must also be uniform. + for (Loop::block_iterator B = TheLoop->block_begin(), + BE = TheLoop->block_end(); B != BE; ++B) + for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end(); + I != IE; ++I) + if (I->getType()->isPointerTy() && isConsecutivePtr(I)) + Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); + while (Worklist.size()) { Instruction *I = dyn_cast<Instruction>(Worklist.back()); Worklist.pop_back(); @@ -3158,19 +3931,22 @@ public: /// \brief Set of potential dependent memory accesses. typedef EquivalenceClasses<MemAccessInfo> DepCandidates; - AccessAnalysis(DataLayout *Dl, DepCandidates &DA) : - DL(Dl), DepCands(DA), AreAllWritesIdentified(true), - AreAllReadsIdentified(true), IsRTCheckNeeded(false) {} + AccessAnalysis(const DataLayout *Dl, AliasAnalysis *AA, DepCandidates &DA) : + DL(Dl), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {} /// \brief Register a load and whether it is only read from. - void addLoad(Value *Ptr, bool IsReadOnly) { + void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) { + Value *Ptr = const_cast<Value*>(Loc.Ptr); + AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.TBAATag); Accesses.insert(MemAccessInfo(Ptr, false)); if (IsReadOnly) ReadOnlyPtr.insert(Ptr); } /// \brief Register a store. - void addStore(Value *Ptr) { + void addStore(AliasAnalysis::Location &Loc) { + Value *Ptr = const_cast<Value*>(Loc.Ptr); + AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.TBAATag); Accesses.insert(MemAccessInfo(Ptr, true)); } @@ -3178,15 +3954,13 @@ public: /// non-intersection. bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck, unsigned &NumComparisons, ScalarEvolution *SE, - Loop *TheLoop, bool ShouldCheckStride = false); + Loop *TheLoop, ValueToValueMap &Strides, + bool ShouldCheckStride = false); /// \brief Goes over all memory accesses, checks whether a RT check is needed /// and builds sets of dependent accesses. void buildDependenceSets() { - // Process read-write pointers first. - processMemAccesses(false); - // Next, process read pointers. 
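// Sketch of what recording Strides[Ptr] / StrideSet enables later: the loop
// can be versioned on the runtime value of the symbolic stride, with the
// fast copy vectorized as a unit-stride loop.
void stridedVersioned(float *a, long n, long s) {
  if (s == 1) {
    for (long i = 0; i < n; ++i) // stride proven 1: contiguous accesses,
      a[i] *= 2.0f;              // wide loads/stores apply
  } else {
    for (long i = 0; i < n; ++i) // original loop kept as the fallback
      a[i * s] *= 2.0f;
  }
}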
- processMemAccesses(true); + processMemAccesses(); } bool isRTCheckNeeded() { return IsRTCheckNeeded; } @@ -3198,48 +3972,40 @@ public: private: typedef SetVector<MemAccessInfo> PtrAccessSet; - typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap; - /// \brief Go over all memory access or only the deferred ones if - /// \p UseDeferred is true and check whether runtime pointer checks are needed - /// and build sets of dependency check candidates. - void processMemAccesses(bool UseDeferred); + /// \brief Go over all memory access and check whether runtime pointer checks + /// are needed /// and build sets of dependency check candidates. + void processMemAccesses(); /// Set of all accesses. PtrAccessSet Accesses; - /// Set of access to check after all writes have been processed. - PtrAccessSet DeferredAccesses; - - /// Map of pointers to last access encountered. - UnderlyingObjToAccessMap ObjToLastAccess; - /// Set of accesses that need a further dependence check. MemAccessInfoSet CheckDeps; /// Set of pointers that are read only. SmallPtrSet<Value*, 16> ReadOnlyPtr; - /// Set of underlying objects already written to. - SmallPtrSet<Value*, 16> WriteObjects; + const DataLayout *DL; - DataLayout *DL; + /// An alias set tracker to partition the access set by underlying object and + //intrinsic property (such as TBAA metadata). + AliasSetTracker AST; /// Sets of potentially dependent accesses - members of one set share an /// underlying pointer. The set "CheckDeps" identfies which sets really need a /// dependence check. DepCandidates &DepCands; - bool AreAllWritesIdentified; - bool AreAllReadsIdentified; bool IsRTCheckNeeded; }; } // end anonymous namespace /// \brief Check whether a pointer can participate in a runtime bounds check. -static bool hasComputableBounds(ScalarEvolution *SE, Value *Ptr) { - const SCEV *PtrScev = SE->getSCEV(Ptr); +static bool hasComputableBounds(ScalarEvolution *SE, ValueToValueMap &Strides, + Value *Ptr) { + const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev); if (!AR) return false; @@ -3249,70 +4015,76 @@ static bool hasComputableBounds(ScalarEvolution *SE, Value *Ptr) { /// \brief Check the stride of the pointer and ensure that it does not wrap in /// the address space. -static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr, - const Loop *Lp); +static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr, + const Loop *Lp, ValueToValueMap &StridesMap); bool AccessAnalysis::canCheckPtrAtRT( - LoopVectorizationLegality::RuntimePointerCheck &RtCheck, - unsigned &NumComparisons, ScalarEvolution *SE, - Loop *TheLoop, bool ShouldCheckStride) { + LoopVectorizationLegality::RuntimePointerCheck &RtCheck, + unsigned &NumComparisons, ScalarEvolution *SE, Loop *TheLoop, + ValueToValueMap &StridesMap, bool ShouldCheckStride) { // Find pointers with computable bounds. We are going to use this information // to place a runtime bound check. - unsigned NumReadPtrChecks = 0; - unsigned NumWritePtrChecks = 0; bool CanDoRT = true; bool IsDepCheckNeeded = isDependencyCheckNeeded(); - // We assign consecutive id to access from different dependence sets. - // Accesses within the same set don't need a runtime check. 
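// Sketch of what "computable bounds" means for the runtime check: the
// pointer must be an affine AddRec (base + i*step), so its footprint is a
// contiguous interval that can be materialized. Toy version, assuming a
// non-negative byte step:
struct AccessInterval { const char *Lo, *Hi; };
AccessInterval accessBounds(const char *Base, long StepBytes, long TripCount) {
  // a[i]   -> {a,+,4}: interval [a, a + 4*n), checkable at runtime
  // a[i*i] -> not affine: no simple interval, hasComputableBounds() fails
  return { Base, Base + StepBytes * TripCount };
}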
- unsigned RunningDepId = 1; - DenseMap<Value *, unsigned> DepSetId; - - for (PtrAccessSet::iterator AI = Accesses.begin(), AE = Accesses.end(); - AI != AE; ++AI) { - const MemAccessInfo &Access = *AI; - Value *Ptr = Access.getPointer(); - bool IsWrite = Access.getInt(); - - // Just add write checks if we have both. - if (!IsWrite && Accesses.count(MemAccessInfo(Ptr, true))) - continue; + NumComparisons = 0; - if (IsWrite) - ++NumWritePtrChecks; - else - ++NumReadPtrChecks; - - if (hasComputableBounds(SE, Ptr) && - // When we run after a failing dependency check we have to make sure we - // don't have wrapping pointers. - (!ShouldCheckStride || isStridedPtr(SE, DL, Ptr, TheLoop) == 1)) { - // The id of the dependence set. - unsigned DepId; - - if (IsDepCheckNeeded) { - Value *Leader = DepCands.getLeaderValue(Access).getPointer(); - unsigned &LeaderId = DepSetId[Leader]; - if (!LeaderId) - LeaderId = RunningDepId++; - DepId = LeaderId; - } else - // Each access has its own dependence set. - DepId = RunningDepId++; - - RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId); - - DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n'); - } else { - CanDoRT = false; + // We assign a consecutive id to access from different alias sets. + // Accesses between different groups doesn't need to be checked. + unsigned ASId = 1; + for (auto &AS : AST) { + unsigned NumReadPtrChecks = 0; + unsigned NumWritePtrChecks = 0; + + // We assign consecutive id to access from different dependence sets. + // Accesses within the same set don't need a runtime check. + unsigned RunningDepId = 1; + DenseMap<Value *, unsigned> DepSetId; + + for (auto A : AS) { + Value *Ptr = A.getValue(); + bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true)); + MemAccessInfo Access(Ptr, IsWrite); + + if (IsWrite) + ++NumWritePtrChecks; + else + ++NumReadPtrChecks; + + if (hasComputableBounds(SE, StridesMap, Ptr) && + // When we run after a failing dependency check we have to make sure we + // don't have wrapping pointers. + (!ShouldCheckStride || + isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) { + // The id of the dependence set. + unsigned DepId; + + if (IsDepCheckNeeded) { + Value *Leader = DepCands.getLeaderValue(Access).getPointer(); + unsigned &LeaderId = DepSetId[Leader]; + if (!LeaderId) + LeaderId = RunningDepId++; + DepId = LeaderId; + } else + // Each access has its own dependence set. + DepId = RunningDepId++; + + RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap); + + DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n'); + } else { + CanDoRT = false; + } } - } - if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2) - NumComparisons = 0; // Only one dependence set. - else { - NumComparisons = (NumWritePtrChecks * (NumReadPtrChecks + - NumWritePtrChecks - 1)); + if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2) + NumComparisons += 0; // Only one dependence set. + else { + NumComparisons += (NumWritePtrChecks * (NumReadPtrChecks + + NumWritePtrChecks - 1)); + } + + ++ASId; } // If the pointers that we would use for the bounds comparison have different @@ -3326,6 +4098,9 @@ bool AccessAnalysis::canCheckPtrAtRT( // Only need to check pointers between two different dependency sets. if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j]) continue; + // Only need to check pointers in the same alias set. 
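// Worked instance of the per-alias-set comparison count above, with
// hypothetical numbers: a set holding 2 write pointers and 3 read pointers
// contributes
//   NumComparisons += 2 * (3 + 2 - 1) = 8
// (each write against every read and every other write; read/read pairs
// never need a check), while a set whose accesses all fell into a single
// dependence set (RunningDepId == 2) contributes 0.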
+ if (RtCheck.AliasSetId[i] != RtCheck.AliasSetId[j]) + continue; Value *PtrI = RtCheck.Pointers[i]; Value *PtrJ = RtCheck.Pointers[j]; @@ -3343,90 +4118,99 @@ bool AccessAnalysis::canCheckPtrAtRT( return CanDoRT; } -static bool isFunctionScopeIdentifiedObject(Value *Ptr) { - return isNoAliasArgument(Ptr) || isNoAliasCall(Ptr) || isa<AllocaInst>(Ptr); -} - -void AccessAnalysis::processMemAccesses(bool UseDeferred) { +void AccessAnalysis::processMemAccesses() { // We process the set twice: first we process read-write pointers, last we // process read-only pointers. This allows us to skip dependence tests for // read-only pointers. - PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses; - for (PtrAccessSet::iterator AI = S.begin(), AE = S.end(); AI != AE; ++AI) { - const MemAccessInfo &Access = *AI; - Value *Ptr = Access.getPointer(); - bool IsWrite = Access.getInt(); - - DepCands.insert(Access); - - // Memorize read-only pointers for later processing and skip them in the - // first round (they need to be checked after we have seen all write - // pointers). Note: we also mark pointer that are not consecutive as - // "read-only" pointers (so that we check "a[b[i]] +="). Hence, we need the - // second check for "!IsWrite". - bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite; - if (!UseDeferred && IsReadOnlyPtr) { - DeferredAccesses.insert(Access); - continue; - } + DEBUG(dbgs() << "LV: Processing memory accesses...\n"); + DEBUG(dbgs() << " AST: "; AST.dump()); + DEBUG(dbgs() << "LV: Accesses:\n"); + DEBUG({ + for (auto A : Accesses) + dbgs() << "\t" << *A.getPointer() << " (" << + (A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? + "read-only" : "read")) << ")\n"; + }); + + // The AliasSetTracker has nicely partitioned our pointers by metadata + // compatibility and potential for underlying-object overlap. As a result, we + // only need to check for potential pointer dependencies within each alias + // set. + for (auto &AS : AST) { + // Note that both the alias-set tracker and the alias sets themselves used + // linked lists internally and so the iteration order here is deterministic + // (matching the original instruction order within each set). + + bool SetHasWrite = false; + + // Map of pointers to last access encountered. + typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap; + UnderlyingObjToAccessMap ObjToLastAccess; + + // Set of access to check after all writes have been processed. + PtrAccessSet DeferredAccesses; + + // Iterate over each alias set twice, once to process read/write pointers, + // and then to process read-only pointers. + for (int SetIteration = 0; SetIteration < 2; ++SetIteration) { + bool UseDeferred = SetIteration > 0; + PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses; + + for (auto A : AS) { + Value *Ptr = A.getValue(); + bool IsWrite = S.count(MemAccessInfo(Ptr, true)); + + // If we're using the deferred access set, then it contains only reads. + bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite; + if (UseDeferred && !IsReadOnlyPtr) + continue; + // Otherwise, the pointer must be in the PtrAccessSet, either as a read + // or a write. + assert(((IsReadOnlyPtr && UseDeferred) || IsWrite || + S.count(MemAccessInfo(Ptr, false))) && + "Alias-set pointer not in the access set?"); + + MemAccessInfo Access(Ptr, IsWrite); + DepCands.insert(Access); + + // Memorize read-only pointers for later processing and skip them in the + // first round (they need to be checked after we have seen all write + // pointers). 
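// Sketch of the guard ultimately materialized for each pair that survives
// the dependence-set and alias-set filters: two access intervals are
// disjoint iff one ends before the other begins.
static bool noOverlap(const char *LoA, const char *HiA,
                      const char *LoB, const char *HiB) {
  return HiA <= LoB || HiB <= LoA; // any failing pair -> run the scalar loop
}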
Note: we also mark pointer that are not consecutive as + // "read-only" pointers (so that we check "a[b[i]] +="). Hence, we need + // the second check for "!IsWrite". + if (!UseDeferred && IsReadOnlyPtr) { + DeferredAccesses.insert(Access); + continue; + } - bool NeedDepCheck = false; - // Check whether there is the possiblity of dependency because of underlying - // objects being the same. - typedef SmallVector<Value*, 16> ValueVector; - ValueVector TempObjects; - GetUnderlyingObjects(Ptr, TempObjects, DL); - for (ValueVector::iterator UI = TempObjects.begin(), UE = TempObjects.end(); - UI != UE; ++UI) { - Value *UnderlyingObj = *UI; - - // If this is a write then it needs to be an identified object. If this a - // read and all writes (so far) are identified function scope objects we - // don't need an identified underlying object but only an Argument (the - // next write is going to invalidate this assumption if it is - // unidentified). - // This is a micro-optimization for the case where all writes are - // identified and we have one argument pointer. - // Otherwise, we do need a runtime check. - if ((IsWrite && !isFunctionScopeIdentifiedObject(UnderlyingObj)) || - (!IsWrite && (!AreAllWritesIdentified || - !isa<Argument>(UnderlyingObj)) && - !isIdentifiedObject(UnderlyingObj))) { - DEBUG(dbgs() << "LV: Found an unidentified " << - (IsWrite ? "write" : "read" ) << " ptr: " << *UnderlyingObj << - "\n"); - IsRTCheckNeeded = (IsRTCheckNeeded || - !isIdentifiedObject(UnderlyingObj) || - !AreAllReadsIdentified); + // If this is a write - check other reads and writes for conflicts. If + // this is a read only check other writes for conflicts (but only if + // there is no other write to the ptr - this is an optimization to + // catch "a[i] = a[i] + " without having to do a dependence check). + if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) { + CheckDeps.insert(Access); + IsRTCheckNeeded = true; + } if (IsWrite) - AreAllWritesIdentified = false; - if (!IsWrite) - AreAllReadsIdentified = false; + SetHasWrite = true; + + // Create sets of pointers connected by a shared alias set and + // underlying object. + typedef SmallVector<Value*, 16> ValueVector; + ValueVector TempObjects; + GetUnderlyingObjects(Ptr, TempObjects, DL); + for (Value *UnderlyingObj : TempObjects) { + UnderlyingObjToAccessMap::iterator Prev = + ObjToLastAccess.find(UnderlyingObj); + if (Prev != ObjToLastAccess.end()) + DepCands.unionSets(Access, Prev->second); + + ObjToLastAccess[UnderlyingObj] = Access; + } } - - // If this is a write - check other reads and writes for conflicts. If - // this is a read only check other writes for conflicts (but only if there - // is no other write to the ptr - this is an optimization to catch "a[i] = - // a[i] + " without having to do a dependence check). - if ((IsWrite || IsReadOnlyPtr) && WriteObjects.count(UnderlyingObj)) - NeedDepCheck = true; - - if (IsWrite) - WriteObjects.insert(UnderlyingObj); - - // Create sets of pointers connected by shared underlying objects. 
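// Illustrative pair of loops for the two-pass (deferred read-only) walk:
void incInPlace(int *a, int n) {
  for (int i = 0; i < n; ++i)
    a[i] = a[i] + 1; // 'a' is read *and* written, so its read is not
}                    // deferred; a single-pointer set needs no check
void addArrays(int *a, const int *b, int n) {
  for (int i = 0; i < n; ++i)
    a[i] += b[i];    // 'b' is read-only: deferred to the second pass and
}                    // checked only because a write to 'a' was seen first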
- UnderlyingObjToAccessMap::iterator Prev = - ObjToLastAccess.find(UnderlyingObj); - if (Prev != ObjToLastAccess.end()) - DepCands.unionSets(Access, Prev->second); - - ObjToLastAccess[UnderlyingObj] = Access; } - - if (NeedDepCheck) - CheckDeps.insert(Access); } } @@ -3468,7 +4252,7 @@ public: typedef PointerIntPair<Value *, 1, bool> MemAccessInfo; typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet; - MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L) + MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L) : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0), ShouldRetryWithRuntimeCheck(false) {} @@ -3494,7 +4278,7 @@ public: /// /// Only checks sets with elements in \p CheckDeps. bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, - MemAccessInfoSet &CheckDeps); + MemAccessInfoSet &CheckDeps, ValueToValueMap &Strides); /// \brief The maximum number of bytes of a vector register we can vectorize /// the accesses safely with. @@ -3506,7 +4290,7 @@ public: private: ScalarEvolution *SE; - DataLayout *DL; + const DataLayout *DL; const Loop *InnermostLoop; /// \brief Maps access locations (ptr, read/write) to program order. @@ -3521,7 +4305,7 @@ private: // We can access this many bytes in parallel safely. unsigned MaxSafeDepDistBytes; - /// \brief If we see a non constant dependence distance we can still try to + /// \brief If we see a non-constant dependence distance we can still try to /// vectorize this loop with runtime checks. bool ShouldRetryWithRuntimeCheck; @@ -3538,7 +4322,8 @@ private: /// distance is smaller than any other distance encountered so far). /// Otherwise, this function returns true signaling a possible dependence. bool isDependent(const MemAccessInfo &A, unsigned AIdx, - const MemAccessInfo &B, unsigned BIdx); + const MemAccessInfo &B, unsigned BIdx, + ValueToValueMap &Strides); /// \brief Check whether the data dependence could prevent store-load /// forwarding. @@ -3554,10 +4339,10 @@ static bool isInBoundsGep(Value *Ptr) { } /// \brief Check whether the access through \p Ptr has a constant stride. -static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr, - const Loop *Lp) { +static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr, + const Loop *Lp, ValueToValueMap &StridesMap) { const Type *Ty = Ptr->getType(); - assert(Ty->isPointerTy() && "Unexpected non ptr"); + assert(Ty->isPointerTy() && "Unexpected non-ptr"); // Make sure that the pointer does not point to aggregate types. 
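// Hedged sketch of the store-load forwarding concern this checker models: a
// loop-carried read of recently stored data, e.g.
//   A[i] = A[i - 3] + 1;   // distance = 12 bytes for 4-byte elements
// forwards cleanly only when the vector access width divides the distance;
// an 8-byte (VF = 2) load would straddle two earlier stores, so the checker
// caps the safe VF/distance combinations instead of risking forwarding
// stalls.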
const PointerType *PtrTy = cast<PointerType>(Ty); @@ -3567,7 +4352,8 @@ static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr, return 0; } - const SCEV *PtrScev = SE->getSCEV(Ptr); + const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, StridesMap, Ptr); + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev); if (!AR) { DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer " @@ -3671,7 +4457,8 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance, } bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, - const MemAccessInfo &B, unsigned BIdx) { + const MemAccessInfo &B, unsigned BIdx, + ValueToValueMap &Strides) { assert (AIdx < BIdx && "Must pass arguments in program order"); Value *APtr = A.getPointer(); @@ -3683,11 +4470,16 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, if (!AIsWrite && !BIsWrite) return false; - const SCEV *AScev = SE->getSCEV(APtr); - const SCEV *BScev = SE->getSCEV(BPtr); + // We cannot check pointers in different address spaces. + if (APtr->getType()->getPointerAddressSpace() != + BPtr->getType()->getPointerAddressSpace()) + return true; + + const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, APtr); + const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, BPtr); - int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop); - int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop); + int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop, Strides); + int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop, Strides); const SCEV *Src = AScev; const SCEV *Sink = BScev; @@ -3721,7 +4513,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist); if (!C) { - DEBUG(dbgs() << "LV: Dependence because of non constant distance\n"); + DEBUG(dbgs() << "LV: Dependence because of non-constant distance\n"); ShouldRetryWithRuntimeCheck = true; return true; } @@ -3792,9 +4584,9 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, return false; } -bool -MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, - MemAccessInfoSet &CheckDeps) { +bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, + MemAccessInfoSet &CheckDeps, + ValueToValueMap &Strides) { MaxSafeDepDistBytes = -1U; while (!CheckDeps.empty()) { @@ -3811,16 +4603,16 @@ MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, // Check every access pair. while (AI != AE) { CheckDeps.erase(*AI); - EquivalenceClasses<MemAccessInfo>::member_iterator OI = llvm::next(AI); + EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI); while (OI != AE) { // Check every accessing instruction pair in program order. 
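// Worked distance example for the constant-distance path handled here:
void dist3(float *A, int n) {
  for (int i = 0; i < n - 3; ++i)
    A[i + 3] = A[i] * 2.0f; // the sink is 3 elements (12 bytes) ahead
}
// Safe while VF * sizeof(float) <= 12: VF = 2 works, but VF = 4 would load
// A[3] before the same group's lane-0 store has written it.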
for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(), I1E = Accesses[*AI].end(); I1 != I1E; ++I1) for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(), I2E = Accesses[*OI].end(); I2 != I2E; ++I2) { - if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2)) + if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2, Strides)) return false; - if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1)) + if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1, Strides)) return false; } ++OI; @@ -3870,11 +4662,13 @@ bool LoopVectorizationLegality::canVectorizeMemory() { continue; LoadInst *Ld = dyn_cast<LoadInst>(it); - if (!Ld) return false; - if (!Ld->isSimple() && !IsAnnotatedParallel) { + if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) { + emitAnalysis(Report(Ld) + << "read with atomic ordering or volatile read"); DEBUG(dbgs() << "LV: Found a non-simple load.\n"); return false; } + NumLoads++; Loads.push_back(Ld); DepChecker.addAccess(Ld); continue; @@ -3883,11 +4677,17 @@ bool LoopVectorizationLegality::canVectorizeMemory() { // Save 'store' instructions. Abort if other instructions write to memory. if (it->mayWriteToMemory()) { StoreInst *St = dyn_cast<StoreInst>(it); - if (!St) return false; + if (!St) { + emitAnalysis(Report(it) << "instruction cannot be vectorized"); + return false; + } if (!St->isSimple() && !IsAnnotatedParallel) { + emitAnalysis(Report(St) + << "write with atomic ordering or volatile write"); DEBUG(dbgs() << "LV: Found a non-simple store.\n"); return false; } + NumStores++; Stores.push_back(St); DepChecker.addAccess(St); } @@ -3905,7 +4705,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { } AccessAnalysis::DepCandidates DependentAccesses; - AccessAnalysis Accesses(DL, DependentAccesses); + AccessAnalysis Accesses(DL, AA, DependentAccesses); // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects // multiple times on the same object. If the ptr is accessed twice, once @@ -3920,6 +4720,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() { Value* Ptr = ST->getPointerOperand(); if (isUniform(Ptr)) { + emitAnalysis( + Report(ST) + << "write to a loop invariant address could not be vectorized"); DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); return false; } @@ -3928,7 +4731,15 @@ bool LoopVectorizationLegality::canVectorizeMemory() { // list. At this phase it is only a 'write' list. if (Seen.insert(Ptr)) { ++NumReadWrites; - Accesses.addStore(Ptr); + + AliasAnalysis::Location Loc = AA->getLocation(ST); + // The TBAA metadata could have a control dependency on the predication + // condition, so we cannot rely on it when determining whether or not we + // need runtime pointer checks. + if (blockNeedsPredication(ST->getParent())) + Loc.TBAATag = nullptr; + + Accesses.addStore(Loc); } } @@ -3951,11 +4762,19 @@ bool LoopVectorizationLegality::canVectorizeMemory() { // read a few words, modify, and write a few words, and some of the // words may be written to the same address. bool IsReadOnlyPtr = false; - if (Seen.insert(Ptr) || !isStridedPtr(SE, DL, Ptr, TheLoop)) { + if (Seen.insert(Ptr) || !isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) { ++NumReads; IsReadOnlyPtr = true; } - Accesses.addLoad(Ptr, IsReadOnlyPtr); + + AliasAnalysis::Location Loc = AA->getLocation(LD); + // The TBAA metadata could have a control dependency on the predication + // condition, so we cannot rely on it when determining whether or not we + // need runtime pointer checks. 
+ if (blockNeedsPredication(LD->getParent())) + Loc.TBAATag = nullptr; + + Accesses.addLoad(Loc, IsReadOnlyPtr); } // If we write (or read-write) to a single destination and there are no @@ -3975,8 +4794,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() { unsigned NumComparisons = 0; bool CanDoRT = false; if (NeedRTCheck) - CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop); - + CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop, + Strides); DEBUG(dbgs() << "LV: We need to do " << NumComparisons << " pointer comparisons.\n"); @@ -3998,6 +4817,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { } if (NeedRTCheck && !CanDoRT) { + emitAnalysis(Report() << "cannot identify array bounds"); DEBUG(dbgs() << "LV: We can't vectorize because we can't find " << "the array bounds.\n"); PtrRtCheck.reset(); @@ -4009,8 +4829,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() { bool CanVecMem = true; if (Accesses.isDependencyCheckNeeded()) { DEBUG(dbgs() << "LV: Checking memory dependencies\n"); - CanVecMem = DepChecker.areDepsSafe(DependentAccesses, - Accesses.getDependenciesToCheck()); + CanVecMem = DepChecker.areDepsSafe( + DependentAccesses, Accesses.getDependenciesToCheck(), Strides); MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes(); if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) { @@ -4024,10 +4844,18 @@ bool LoopVectorizationLegality::canVectorizeMemory() { PtrRtCheck.Need = true; CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, - TheLoop, true); + TheLoop, Strides, true); // Check that we did not collect too many pointers or found an unsizeable // pointer. if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { + if (!CanDoRT && NumComparisons > 0) + emitAnalysis(Report() + << "cannot check memory dependencies at runtime"); + else + emitAnalysis(Report() + << NumComparisons << " exceeds limit of " + << RuntimeMemoryCheckThreshold + << " dependent memory operations checked at runtime"); DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n"); PtrRtCheck.reset(); return false; @@ -4037,6 +4865,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() { } } + if (!CanVecMem) + emitAnalysis(Report() << "unsafe dependent memory operations in loop"); + DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") << " need a runtime memory check.\n"); @@ -4080,7 +4911,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // We only allow for a single reduction value to be used outside the loop. // This includes users of the reduction, variables (which form a cycle // which ends in the phi node). - Instruction *ExitInstruction = 0; + Instruction *ExitInstruction = nullptr; // Indicates that we found a reduction operation in our scan. bool FoundReduxOp = false; @@ -4094,7 +4925,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // the number of instruction we saw from the recognized min/max pattern, // to make sure we only see exactly the two instructions. unsigned NumCmpSelectPatternInst = 0; - ReductionInstDesc ReduxDesc(false, 0); + ReductionInstDesc ReduxDesc(false, nullptr); SmallPtrSet<Instruction *, 8> VisitedInsts; SmallVector<Instruction *, 8> Worklist; @@ -4162,23 +4993,22 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // Check whether we found a reduction operator. FoundReduxOp |= !IsAPhi; - // Process users of current instruction. Push non PHI nodes after PHI nodes + // Process users of current instruction. 
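// Compact restatement of the gate implemented above (sketch; the names
// mirror the surrounding variables):
static bool okToVectorizeWithRTChecks(bool CanDoRT, unsigned NumComparisons,
                                      unsigned Threshold) {
  // Failing either condition triggers the emitAnalysis() reports and resets
  // PtrRtCheck before giving up on the loop.
  return CanDoRT && NumComparisons <= Threshold;
}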
Push non-PHI nodes after PHI nodes // onto the stack. This way we are going to have seen all inputs to PHI // nodes once we get to them. SmallVector<Instruction *, 8> NonPHIs; SmallVector<Instruction *, 8> PHIs; - for (Value::use_iterator UI = Cur->use_begin(), E = Cur->use_end(); UI != E; - ++UI) { - Instruction *Usr = cast<Instruction>(*UI); + for (User *U : Cur->users()) { + Instruction *UI = cast<Instruction>(U); // Check if we found the exit user. - BasicBlock *Parent = Usr->getParent(); + BasicBlock *Parent = UI->getParent(); if (!TheLoop->contains(Parent)) { // Exit if you find multiple outside users or if the header phi node is // being used. In this case the user uses the value of the previous // iteration, in which case we would loose "VF-1" iterations of the // reduction operation if we vectorize. - if (ExitInstruction != 0 || Cur == Phi) + if (ExitInstruction != nullptr || Cur == Phi) return false; // The instruction used by an outside user must be the last instruction @@ -4194,21 +5024,21 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // Process instructions only once (termination). Each reduction cycle // value must only be used once, except by phi nodes and min/max // reductions which are represented as a cmp followed by a select. - ReductionInstDesc IgnoredVal(false, 0); - if (VisitedInsts.insert(Usr)) { - if (isa<PHINode>(Usr)) - PHIs.push_back(Usr); + ReductionInstDesc IgnoredVal(false, nullptr); + if (VisitedInsts.insert(UI)) { + if (isa<PHINode>(UI)) + PHIs.push_back(UI); else - NonPHIs.push_back(Usr); - } else if (!isa<PHINode>(Usr) && - ((!isa<FCmpInst>(Usr) && - !isa<ICmpInst>(Usr) && - !isa<SelectInst>(Usr)) || - !isMinMaxSelectCmpPattern(Usr, IgnoredVal).IsReduction)) + NonPHIs.push_back(UI); + } else if (!isa<PHINode>(UI) && + ((!isa<FCmpInst>(UI) && + !isa<ICmpInst>(UI) && + !isa<SelectInst>(UI)) || + !isMinMaxSelectCmpPattern(UI, IgnoredVal).IsReduction)) return false; // Remember that we completed the cycle. - if (Usr == Phi) + if (UI == Phi) FoundStartPHI = true; } Worklist.append(PHIs.begin(), PHIs.end()); @@ -4248,13 +5078,13 @@ LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) && "Expect a select instruction"); - Instruction *Cmp = 0; - SelectInst *Select = 0; + Instruction *Cmp = nullptr; + SelectInst *Select = nullptr; // We must handle the select(cmp()) as a single instruction. Advance to the // select. if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) { - if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->use_begin()))) + if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->user_begin()))) return ReductionInstDesc(false, I); return ReductionInstDesc(Select, Prev.MinMaxKind); } @@ -4399,7 +5229,16 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, } // We don't predicate stores at the moment. - if (it->mayWriteToMemory() || it->mayThrow()) + if (it->mayWriteToMemory()) { + StoreInst *SI = dyn_cast<StoreInst>(it); + // We only support predication of stores in basic blocks with one + // predecessor. + if (!SI || ++NumPredStores > NumberOfStoresToPredicate || + !SafePtrs.count(SI->getPointerOperand()) || + !SI->getParent()->getSinglePredecessor()) + return false; + } + if (it->mayThrow()) return false; // Check that we don't have a constant expression that can trap as operand. 
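// Illustrative source-level reduction for the cmp+select recognizer used
// here: the FCmp and its single-use Select form one reduction step, which
// is why NumCmpSelectPatternInst expects exactly two instructions.
float maxReduce(const float *A, int N) {
  float M = A[0];
  for (int i = 1; i < N; ++i)
    M = (A[i] > M) ? A[i] : M; // fcmp + select -> max reduction
  return M;
}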
@@ -4426,7 +5265,8 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, LoopVectorizationCostModel::VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, - unsigned UserVF) { + unsigned UserVF, + bool ForceVectorization) { // Width 1 means no vectorize VectorizationFactor Factor = { 1U, 0U }; if (OptForSize && Legal->getRuntimePointerCheck()->Need) { @@ -4434,6 +5274,11 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, return Factor; } + if (!EnableCondStoresVectorization && Legal->NumPredStores) { + DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n"); + return Factor; + } + // Find the trip count. unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch()); DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); @@ -4491,8 +5336,18 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, } float Cost = expectedCost(1); +#ifndef NDEBUG + const float ScalarCost = Cost; +#endif /* NDEBUG */ unsigned Width = 1; - DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)Cost << ".\n"); + DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); + + // Ignore scalar width, because the user explicitly wants vectorization. + if (ForceVectorization && VF > 1) { + Width = 2; + Cost = expectedCost(Width) / (float)Width; + } + for (unsigned i=2; i <= VF; i*=2) { // Notice that the vector loop needs to be executed less times, so // we need to divide the cost of the vector loops by the width of @@ -4506,7 +5361,10 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, } } - DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n"); + DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() + << "LV: Vectorization seems to be not beneficial, " + << "but was forced by a user.\n"); + DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n"); Factor.Width = Width; Factor.Cost = Width * Cost; return Factor; @@ -4589,9 +5447,17 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, if (TC > 1 && TC < TinyTripCountUnrollThreshold) return 1; - unsigned TargetVectorRegisters = TTI.getNumberOfRegisters(true); - DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters << - " vector registers\n"); + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); + DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters << + " registers\n"); + + if (VF == 1) { + if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) + TargetNumRegisters = ForceTargetNumScalarRegs; + } else { + if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) + TargetNumRegisters = ForceTargetNumVectorRegs; + } LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage(); // We divide by these constants so assume that we have at least one @@ -4604,12 +5470,29 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, // registers. These registers are used by all of the unrolled instances. // Next, divide the remaining registers by the number of registers that is // required by the loop, in order to estimate how many parallel instances - // fit without causing spills. - unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers; + // fit without causing spills. All of this is rounded down if necessary to be + // a power of two. We want power of two unroll factors to simplify any + // addressing operations or alignment considerations. 
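// Worked cost comparison for the selection loop above, with hypothetical
// numbers (each width's cost is divided by the width because the vector
// loop executes that many times fewer iterations):
//   scalar: 8        -> 8.0 per original iteration
//   VF = 2: 10 / 2   -> 5.0   <- selected
//   VF = 4: 24 / 4   -> 6.0
// When vectorization is forced, Width starts at 2, so the scalar baseline
// can no longer win even where 8.0 would have been cheapest.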
+ unsigned UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / + R.MaxLocalUsers); + + // Don't count the induction variable as unrolled. + if (EnableIndVarRegisterHeur) + UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / + std::max(1U, (R.MaxLocalUsers - 1))); // Clamp the unroll factor ranges to reasonable factors. unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor(); + // Check if the user has overridden the unroll max. + if (VF == 1) { + if (ForceTargetMaxScalarUnrollFactor.getNumOccurrences() > 0) + MaxUnrollSize = ForceTargetMaxScalarUnrollFactor; + } else { + if (ForceTargetMaxVectorUnrollFactor.getNumOccurrences() > 0) + MaxUnrollSize = ForceTargetMaxVectorUnrollFactor; + } + // If we did not calculate the cost for VF (because the user selected the VF) // then we calculate the cost of VF here. if (LoopCost == 0) @@ -4622,32 +5505,40 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, else if (UF < 1) UF = 1; - bool HasReductions = Legal->getReductionVars()->size(); - - // Decide if we want to unroll if we decided that it is legal to vectorize - // but not profitable. - if (VF == 1) { - if (TheLoop->getNumBlocks() > 1 || !HasReductions || - LoopCost > SmallLoopCost) - return 1; - - return UF; - } - - if (HasReductions) { + // Unroll if we vectorized this loop and there is a reduction that could + // benefit from unrolling. + if (VF > 1 && Legal->getReductionVars()->size()) { DEBUG(dbgs() << "LV: Unrolling because of reductions.\n"); return UF; } - // We want to unroll tiny loops in order to reduce the loop overhead. - // We assume that the cost overhead is 1 and we use the cost model - // to estimate the cost of the loop and unroll until the cost of the - // loop overhead is about 5% of the cost of the loop. + // Note that if we've already vectorized the loop we will have done the + // runtime check and so unrolling won't require further checks. + bool UnrollingRequiresRuntimePointerCheck = + (VF == 1 && Legal->getRuntimePointerCheck()->Need); + + // We want to unroll small loops in order to reduce the loop overhead and + // potentially expose ILP opportunities. DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); - if (LoopCost < SmallLoopCost) { + if (!UnrollingRequiresRuntimePointerCheck && + LoopCost < SmallLoopCost) { + // We assume that the cost overhead is 1 and we use the cost model + // to estimate the cost of the loop and unroll until the cost of the + // loop overhead is about 5% of the cost of the loop. + unsigned SmallUF = std::min(UF, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); + + // Unroll until store/load ports (estimated by max unroll factor) are + // saturated. + unsigned StoresUF = UF / (Legal->NumStores ? Legal->NumStores : 1); + unsigned LoadsUF = UF / (Legal->NumLoads ? Legal->NumLoads : 1); + + if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) { + DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n"); + return std::max(StoresUF, LoadsUF); + } + DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n"); - unsigned NewUF = SmallLoopCost / (LoopCost + 1); - return std::min(NewUF, UF); + return SmallUF; } DEBUG(dbgs() << "LV: Not Unrolling.\n"); @@ -4783,6 +5674,11 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { continue; unsigned C = getInstructionCost(it, VF); + + // Check if we should override the cost. 
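// Worked register-pressure example for the unroll-factor computation above,
// with hypothetical numbers:
//   TargetNumRegisters = 16, R.LoopInvariantRegs = 2, R.MaxLocalUsers = 3
//   UF = PowerOf2Floor((16 - 2) / 3) = PowerOf2Floor(4) = 4
// and with EnableIndVarRegisterHeur discounting the induction variable:
//   UF = PowerOf2Floor((16 - 2 - 1) / max(1, 3 - 1)) = PowerOf2Floor(6) = 4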
+ if (ForceTargetInstructionCost.getNumOccurrences() > 0) + C = ForceTargetInstructionCost; + BlockCost += C; DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " << VF << " For instruction: " << *it << '\n'); @@ -4853,6 +5749,12 @@ static bool isLikelyComplexAddressComputation(Value *Ptr, return StepVal > MaxMergeDistance; } +static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { + if (Legal->hasStride(I->getOperand(0)) || Legal->hasStride(I->getOperand(1))) + return true; + return false; +} + unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // If we know that this instruction will remain uniform, check the cost of @@ -4895,15 +5797,25 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { case Instruction::And: case Instruction::Or: case Instruction::Xor: { + // Since we will replace the stride by 1 the multiplication should go away. + if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) + return 0; // Certain instructions can be cheaper to vectorize if they have a constant // second vector operand. One example of this are shifts on x86. TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_AnyValue; + Value *Op2 = I->getOperand(1); - if (isa<ConstantInt>(I->getOperand(1))) + // Check for a splat of a constant or for a non uniform vector of constants. + if (isa<ConstantInt>(Op2)) Op2VK = TargetTransformInfo::OK_UniformConstantValue; + else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) { + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; + if (cast<Constant>(Op2)->getSplatValue() != nullptr) + Op2VK = TargetTransformInfo::OK_UniformConstantValue; + } return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK); } @@ -5047,7 +5959,9 @@ char LoopVectorize::ID = 0; static const char lv_name[] = "Loop Vectorization"; INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(LoopInfo) @@ -5055,8 +5969,8 @@ INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { - Pass *createLoopVectorizePass(bool NoUnrolling) { - return new LoopVectorize(NoUnrolling); + Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) { + return new LoopVectorize(NoUnrolling, AlwaysVectorize); } } @@ -5073,7 +5987,8 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { } -void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) { +void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, + bool IfPredicateStore) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. SmallVector<VectorParts, 4> Params; @@ -5113,15 +6028,45 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) { // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); - Value *UndefVec = IsVoidRetTy ? 0 : + Value *UndefVec = IsVoidRetTy ? 
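// Sketch of why the shape of Op2 matters (x86-flavored): a shift by a splat
// constant lowers to a single vector instruction, while a per-lane variable
// shift may be expanded (pre-AVX2).
//   x << 3                          -> OK_UniformConstantValue
//   x << <i32 1, i32 2, i32 3, ...> -> OK_NonUniformConstantValue
//   x << y                          -> OK_AnyValue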
nullptr : UndefValue::get(Instr->getType()); // Create a new entry in the WidenMap and initialize it to Undef or Null. VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); + Instruction *InsertPt = Builder.GetInsertPoint(); + BasicBlock *IfBlock = Builder.GetInsertBlock(); + BasicBlock *CondBlock = nullptr; + + VectorParts Cond; + Loop *VectorLp = nullptr; + if (IfPredicateStore) { + assert(Instr->getParent()->getSinglePredecessor() && + "Only support single predecessor blocks"); + Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), + Instr->getParent()); + VectorLp = LI->getLoopFor(IfBlock); + assert(VectorLp && "Must have a loop for this block"); + } + // For each vector unroll 'part': for (unsigned Part = 0; Part < UF; ++Part) { // For each scalar that we create: + // Start an "if (pred) a[i] = ..." block. + Value *Cmp = nullptr; + if (IfPredicateStore) { + if (Cond[Part]->getType()->isVectorTy()) + Cond[Part] = + Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0)); + Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part], + ConstantInt::get(Cond[Part]->getType(), 1)); + CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); + LoopVectorBody.push_back(CondBlock); + VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase()); + // Update Builder with newly created basic block. + Builder.SetInsertPoint(InsertPt); + } + Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); @@ -5138,13 +6083,26 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) { // so that future users will be able to use it. if (!IsVoidRetTy) VecResults[Part] = Cloned; + + // End if-block. + if (IfPredicateStore) { + BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); + LoopVectorBody.push_back(NewIfBlock); + VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase()); + Builder.SetInsertPoint(InsertPt); + Instruction *OldBr = IfBlock->getTerminator(); + BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); + OldBr->eraseFromParent(); + IfBlock = NewIfBlock; + } } } -void -InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr, - LoopVectorizationLegality*) { - return scalarizeInstruction(Instr); +void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { + StoreInst *SI = dyn_cast<StoreInst>(Instr); + bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent())); + + return scalarizeInstruction(Instr, IfPredicateStore); } Value *InnerLoopUnroller::reverseVector(Value *Vec) { @@ -5163,4 +6121,3 @@ Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx, Constant *C = ConstantInt::get(ITy, StartIdx, Negate); return Builder.CreateAdd(Val, C, "induction"); } - diff --git a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index c72b51f..53a43d9 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -15,36 +15,39 @@ // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks. 
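// Aside on the scalarizeInstruction hunk above: per unroll part, a
// predicated store is wrapped in its own small diamond (IR-like sketch):
//   %c = extractelement <N x i1> %mask, i32 0
//   br i1 %c, label %cond.store, label %else
// cond.store:                ; new block, appended to LoopVectorBody
//   store ...                ; the cloned scalar store
//   br label %else
// else:                      ; execution rejoins here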
// //===----------------------------------------------------------------------===// -#define SV_NAME "slp-vectorizer" -#define DEBUG_TYPE "SLP" - #include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Analysis/Verifier.h" -#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" +#include "llvm/IR/NoFolder.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/IR/Verifier.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/VectorUtils.h" #include <algorithm> #include <map> using namespace llvm; +#define SV_NAME "slp-vectorizer" +#define DEBUG_TYPE "SLP" + static cl::opt<int> SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " @@ -71,8 +74,6 @@ struct BlockNumbering { BlockNumbering(BasicBlock *Bb) : BB(Bb), Valid(false) {} - BlockNumbering() : BB(0), Valid(false) {} - void numberInstructions() { unsigned Loc = 0; InstrIdx.clear(); @@ -119,15 +120,15 @@ private: static BasicBlock *getSameBlock(ArrayRef<Value *> VL) { Instruction *I0 = dyn_cast<Instruction>(VL[0]); if (!I0) - return 0; + return nullptr; BasicBlock *BB = I0->getParent(); for (int i = 1, e = VL.size(); i < e; i++) { Instruction *I = dyn_cast<Instruction>(VL[i]); if (!I) - return 0; + return nullptr; if (BB != I->getParent()) - return 0; + return nullptr; } return BB; } @@ -148,6 +149,48 @@ static bool isSplat(ArrayRef<Value *> VL) { return true; } +///\returns Opcode that can be clubbed with \p Op to create an alternate +/// sequence which can later be merged as a ShuffleVector instruction. +static unsigned getAltOpcode(unsigned Op) { + switch (Op) { + case Instruction::FAdd: + return Instruction::FSub; + case Instruction::FSub: + return Instruction::FAdd; + case Instruction::Add: + return Instruction::Sub; + case Instruction::Sub: + return Instruction::Add; + default: + return 0; + } +} + +///\returns bool representing if Opcode \p Op can be part +/// of an alternate sequence which can later be merged as +/// a ShuffleVector instruction. +static bool canCombineAsAltInst(unsigned Op) { + if (Op == Instruction::FAdd || Op == Instruction::FSub || + Op == Instruction::Sub || Op == Instruction::Add) + return true; + return false; +} + +/// \returns ShuffleVector instruction if intructions in \p VL have +/// alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence. +/// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...) +static unsigned isAltInst(ArrayRef<Value *> VL) { + Instruction *I0 = dyn_cast<Instruction>(VL[0]); + unsigned Opcode = I0->getOpcode(); + unsigned AltOpcode = getAltOpcode(Opcode); + for (int i = 1, e = VL.size(); i < e; i++) { + Instruction *I = dyn_cast<Instruction>(VL[i]); + if (!I || I->getOpcode() != ((i & 1) ? 
AltOpcode : Opcode)) + return 0; + } + return Instruction::ShuffleVector; +} + /// \returns The opcode if all of the Instructions in \p VL have the same /// opcode, or zero. static unsigned getSameOpcode(ArrayRef<Value *> VL) { @@ -157,8 +200,11 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) { unsigned Opcode = I0->getOpcode(); for (int i = 1, e = VL.size(); i < e; i++) { Instruction *I = dyn_cast<Instruction>(VL[i]); - if (!I || Opcode != I->getOpcode()) + if (!I || Opcode != I->getOpcode()) { + if (canCombineAsAltInst(Opcode) && i == 1) + return isAltInst(VL); return 0; + } } return Opcode; } @@ -179,7 +225,7 @@ static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) { switch (Kind) { default: - MD = 0; // Remove unknown metadata + MD = nullptr; // Remove unknown metadata break; case LLVMContext::MD_tbaa: MD = MDNode::getMostGenericTBAA(MD, IMD); @@ -200,7 +246,7 @@ static Type* getSameType(ArrayRef<Value *> VL) { Type *Ty = VL[0]->getType(); for (int i = 1, e = VL.size(); i < e; i++) if (VL[i]->getType() != Ty) - return 0; + return nullptr; return Ty; } @@ -343,18 +389,11 @@ public: typedef SmallPtrSet<Value *, 16> ValueSet; typedef SmallVector<StoreInst *, 8> StoreList; - BoUpSLP(Function *Func, ScalarEvolution *Se, DataLayout *Dl, - TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li, - DominatorTree *Dt) : - F(Func), SE(Se), DL(Dl), TTI(Tti), AA(Aa), LI(Li), DT(Dt), - Builder(Se->getContext()) { - // Setup the block numbering utility for all of the blocks in the - // function. - for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) { - BasicBlock *BB = it; - BlocksNumbers[BB] = BlockNumbering(BB); - } - } + BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl, + TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, + LoopInfo *Li, DominatorTree *Dt) + : F(Func), SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), + Builder(Se->getContext()) {} /// \brief Vectorize the tree that starts with the elements in \p VL. /// Returns the vectorized root. @@ -364,13 +403,13 @@ public: /// A negative number means that this is profitable. int getTreeCost(); - /// Construct a vectorizable tree that starts at \p Roots and is possibly - /// used by a reduction of \p RdxOps. - void buildTree(ArrayRef<Value *> Roots, ValueSet *RdxOps = 0); + /// Construct a vectorizable tree that starts at \p Roots, ignoring users for + /// the purpose of scheduling and extraction in the \p UserIgnoreLst. + void buildTree(ArrayRef<Value *> Roots, + ArrayRef<Value *> UserIgnoreLst = None); /// Clear the internal data structures that are created by 'buildTree'. void deleteTree() { - RdxOps = 0; VectorizableTree.clear(); ScalarToTreeEntry.clear(); MustGather.clear(); @@ -383,6 +422,7 @@ public: /// \brief Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); + private: struct TreeEntry; @@ -442,10 +482,10 @@ private: /// \returns whether the VectorizableTree is fully vectoriable and will /// be beneficial even the tree height is tiny. - bool isFullyVectorizableTinyTree(); + bool isFullyVectorizableTinyTree(); struct TreeEntry { - TreeEntry() : Scalars(), VectorizedValue(0), LastScalarIndex(0), + TreeEntry() : Scalars(), VectorizedValue(nullptr), LastScalarIndex(0), NeedToGather(0) {} /// \returns true if the scalars in VL are equal to this entry. @@ -521,19 +561,27 @@ private: /// Holds all of the instructions that we gathered. 
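// Illustrative bundle for the new alternate-opcode path: lane 0 fixes
// Opcode (FSub here) and odd lanes must use its alternate (FAdd), matching
// (i & 1 ? AltOpcode : Opcode); the bundle becomes two vector ops blended
// by one shufflevector (e.g. x86 ADDSUBPS).
void addsub(float *r, const float *a, const float *b) {
  r[0] = a[0] - b[0];
  r[1] = a[1] + b[1];
  r[2] = a[2] - b[2];
  r[3] = a[3] + b[3];
}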
SetVector<Instruction *> GatherSeq; /// A list of blocks that we are going to CSE. - SmallSet<BasicBlock *, 8> CSEBlocks; + SetVector<BasicBlock *> CSEBlocks; /// Numbers instructions in different blocks. DenseMap<BasicBlock *, BlockNumbering> BlocksNumbers; - /// Reduction operators. - ValueSet *RdxOps; + /// \brief Get the corresponding instruction numbering list for a given + /// BasicBlock. The list is allocated lazily. + BlockNumbering &getBlockNumbering(BasicBlock *BB) { + auto I = BlocksNumbers.insert(std::make_pair(BB, BlockNumbering(BB))); + return I.first->second; + } + + /// List of users to ignore during scheduling and that don't need extracting. + ArrayRef<Value *> UserIgnoreList; // Analysis and block reference. Function *F; ScalarEvolution *SE; - DataLayout *DL; + const DataLayout *DL; TargetTransformInfo *TTI; + TargetLibraryInfo *TLI; AliasAnalysis *AA; LoopInfo *LI; DominatorTree *DT; @@ -541,9 +589,10 @@ private: IRBuilder<> Builder; }; -void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ValueSet *Rdx) { +void BoUpSLP::buildTree(ArrayRef<Value *> Roots, + ArrayRef<Value *> UserIgnoreLst) { deleteTree(); - RdxOps = Rdx; + UserIgnoreList = UserIgnoreLst; if (!getSameType(Roots)) return; buildTree_rec(Roots, 0); @@ -560,29 +609,29 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ValueSet *Rdx) { if (Entry->NeedToGather) continue; - for (Value::use_iterator User = Scalar->use_begin(), - UE = Scalar->use_end(); User != UE; ++User) { - DEBUG(dbgs() << "SLP: Checking user:" << **User << ".\n"); + for (User *U : Scalar->users()) { + DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); // Skip in-tree scalars that become vectors. - if (ScalarToTreeEntry.count(*User)) { + if (ScalarToTreeEntry.count(U)) { DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << - **User << ".\n"); - int Idx = ScalarToTreeEntry[*User]; (void) Idx; + *U << ".\n"); + int Idx = ScalarToTreeEntry[U]; (void) Idx; assert(!VectorizableTree[Idx].NeedToGather && "Bad state"); continue; } - Instruction *UserInst = dyn_cast<Instruction>(*User); + Instruction *UserInst = dyn_cast<Instruction>(U); if (!UserInst) continue; - // Ignore uses that are part of the reduction. - if (Rdx && std::find(Rdx->begin(), Rdx->end(), UserInst) != Rdx->end()) + // Ignore users in the user ignore list. + if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) != + UserIgnoreList.end()) continue; - DEBUG(dbgs() << "SLP: Need to extract:" << **User << " from lane " << + DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " << Lane << " from " << *Scalar << ".\n"); - ExternalUses.push_back(ExternalUser(Scalar, *User, Lane)); + ExternalUses.push_back(ExternalUser(Scalar, U, Lane)); } } } @@ -591,6 +640,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ValueSet *Rdx) { void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { bool SameTy = getSameType(VL); (void)SameTy; + bool isAltShuffle = false; assert(SameTy && "Invalid types!"); if (Depth == RecursionMaxDepth) { @@ -612,10 +662,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { newTreeEntry(VL, false); return; } + unsigned Opcode = getSameOpcode(VL); + + // Check that this shuffle vector refers to the alternate + // sequence of opcodes. + if (Opcode == Instruction::ShuffleVector) { + Instruction *I0 = dyn_cast<Instruction>(VL[0]); + unsigned Op = I0->getOpcode(); + if (Op != Instruction::ShuffleVector) + isAltShuffle = true; + } // If all of the operands are identical or constant we have a simple solution. 
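// Sketch of what an ExternalUses entry later turns into: when one lane of a
// vectorized bundle still has a scalar user outside the tree, an
// extractelement is emitted for that lane, e.g.
//   %v  = fadd <4 x float> %x, %y
//   %s2 = extractelement <4 x float> %v, i32 2 ; feeds the scalar user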
- if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || - !getSameOpcode(VL)) { + if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) { DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); newTreeEntry(VL, false); return; } @@ -669,57 +728,57 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { for (unsigned i = 0, e = VL.size(); i != e; ++i) { Instruction *Scalar = cast<Instruction>(VL[i]); DEBUG(dbgs() << "SLP: Checking users of " << *Scalar << ". \n"); - for (Value::use_iterator U = Scalar->use_begin(), UE = Scalar->use_end(); - U != UE; ++U) { - DEBUG(dbgs() << "SLP: \tUser " << **U << ". \n"); - Instruction *User = dyn_cast<Instruction>(*U); - if (!User) { + for (User *U : Scalar->users()) { + DEBUG(dbgs() << "SLP: \tUser " << *U << ". \n"); + Instruction *UI = dyn_cast<Instruction>(U); + if (!UI) { DEBUG(dbgs() << "SLP: Gathering due to unknown user. \n"); newTreeEntry(VL, false); return; } // We don't care if the user is in a different basic block. - BasicBlock *UserBlock = User->getParent(); + BasicBlock *UserBlock = UI->getParent(); if (UserBlock != BB) { DEBUG(dbgs() << "SLP: User from a different basic block " - << *User << ". \n"); + << *UI << ". \n"); continue; } // If this is a PHINode within this basic block then we can place the // extract wherever we want. - if (isa<PHINode>(*User)) { - DEBUG(dbgs() << "SLP: \tWe can schedule PHIs:" << *User << ". \n"); + if (isa<PHINode>(*UI)) { + DEBUG(dbgs() << "SLP: \tWe can schedule PHIs:" << *UI << ". \n"); continue; } // Check if this is a safe in-tree user. - if (ScalarToTreeEntry.count(User)) { - int Idx = ScalarToTreeEntry[User]; + if (ScalarToTreeEntry.count(UI)) { + int Idx = ScalarToTreeEntry[UI]; int VecLocation = VectorizableTree[Idx].LastScalarIndex; if (VecLocation <= MyLastIndex) { DEBUG(dbgs() << "SLP: Gathering due to unschedulable vector. \n"); newTreeEntry(VL, false); return; } - DEBUG(dbgs() << "SLP: In-tree user (" << *User << ") at #" << + DEBUG(dbgs() << "SLP: In-tree user (" << *UI << ") at #" << VecLocation << " vector value (" << *Scalar << ") at #" << MyLastIndex << ".\n"); continue; } - // This user is part of the reduction. - if (RdxOps && RdxOps->count(User)) + // Ignore users in the user ignore list. + if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UI) != + UserIgnoreList.end()) continue; // Make sure that we can schedule this unknown user. - BlockNumbering &BN = BlocksNumbers[BB]; - int UserIndex = BN.getIndex(User); + BlockNumbering &BN = getBlockNumbering(BB); + int UserIndex = BN.getIndex(UI); if (UserIndex < MyLastIndex) { DEBUG(dbgs() << "SLP: Can't schedule extractelement for " - << *User << ". \n"); + << *UI << ". \n"); newTreeEntry(VL, false); return; } @@ -738,11 +797,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { // Check that instructions in this bundle don't reference other instructions. // The runtime of this check is O(N * N-1 * uses(N)) and a typical N is 4. for (unsigned i = 0, e = VL.size(); i < e; ++i) { - for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end(); - U != UE; ++U) { + for (User *U : VL[i]->users()) { for (unsigned j = 0; j < e; ++j) { - if (i != j && *U == VL[j]) { - DEBUG(dbgs() << "SLP: Intra-bundle dependencies!" << **U << ". \n"); + if (i != j && U == VL[j]) { + DEBUG(dbgs() << "SLP: Intra-bundle dependencies!" << *U << ". 
\n"); newTreeEntry(VL, false); return; } @@ -752,8 +810,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); - unsigned Opcode = getSameOpcode(VL); - // Check if it is safe to sink the loads or the stores. if (Opcode == Instruction::Load || Opcode == Instruction::Store) { Instruction *Last = getLastInstruction(VL); @@ -778,7 +834,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { // Check for terminator values (e.g. invoke). for (unsigned j = 0; j < VL.size(); ++j) for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { - TerminatorInst *Term = dyn_cast<TerminatorInst>(cast<PHINode>(VL[j])->getIncomingValue(i)); + TerminatorInst *Term = dyn_cast<TerminatorInst>( + cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i))); if (Term) { DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); newTreeEntry(VL, false); @@ -793,7 +850,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { ValueList Operands; // Prepare the operand vector. for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast<PHINode>(VL[j])->getIncomingValue(i)); + Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock( + PH->getIncomingBlock(i))); buildTree_rec(Operands, Depth + 1); } @@ -910,8 +968,20 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { ValueList Left, Right; reorderInputsAccordingToOpcode(VL, Left, Right); - buildTree_rec(Left, Depth + 1); - buildTree_rec(Right, Depth + 1); + BasicBlock *LeftBB = getSameBlock(Left); + BasicBlock *RightBB = getSameBlock(Right); + // If we have common uses on separate paths in the tree make sure we + // process the one with greater common depth first. + // We can use block numbering to determine the subtree traversal as + // earler user has to come in between the common use and the later user. + if (LeftBB && RightBB && LeftBB == RightBB && + getLastIndex(Right) > getLastIndex(Left)) { + buildTree_rec(Right, Depth + 1); + buildTree_rec(Left, Depth + 1); + } else { + buildTree_rec(Left, Depth + 1); + buildTree_rec(Right, Depth + 1); + } return; } @@ -925,12 +995,57 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { } return; } + case Instruction::GetElementPtr: { + // We don't combine GEPs with complicated (nested) indexing. + for (unsigned j = 0; j < VL.size(); ++j) { + if (cast<Instruction>(VL[j])->getNumOperands() != 2) { + DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); + newTreeEntry(VL, false); + return; + } + } + + // We can't combine several GEPs into one vector if they operate on + // different types. + Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType(); + for (unsigned j = 0; j < VL.size(); ++j) { + Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType(); + if (Ty0 != CurTy) { + DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); + newTreeEntry(VL, false); + return; + } + } + + // We don't combine GEPs with non-constant indexes. 
+ for (unsigned j = 0; j < VL.size(); ++j) { + auto Op = cast<Instruction>(VL[j])->getOperand(1); + if (!isa<ConstantInt>(Op)) { + DEBUG( + dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); + newTreeEntry(VL, false); + return; + } + } + + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); + for (unsigned i = 0, e = 2; i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); + + buildTree_rec(Operands, Depth + 1); + } + return; + } case Instruction::Store: { // Check if the stores are consecutive or if we need to swizzle them. for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1])) { newTreeEntry(VL, false); - DEBUG(dbgs() << "SLP: Non consecutive store.\n"); + DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } @@ -946,6 +1061,76 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { buildTree_rec(Operands, Depth + 1); return; } + case Instruction::Call: { + // Check if the calls are all to the same vectorizable intrinsic. + CallInst *CI = cast<CallInst>(VL[0]); + // Check if this is an Intrinsic call or something that can be + // represented by an intrinsic call. + Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); + if (!isTriviallyVectorizable(ID)) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); + return; + } + Function *Int = CI->getCalledFunction(); + Value *A1I = nullptr; + if (hasVectorInstrinsicScalarOpd(ID, 1)) + A1I = CI->getArgOperand(1); + for (unsigned i = 1, e = VL.size(); i != e; ++i) { + CallInst *CI2 = dyn_cast<CallInst>(VL[i]); + if (!CI2 || CI2->getCalledFunction() != Int || + getIntrinsicIDForCall(CI2, TLI) != ID) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] + << "\n"); + return; + } + // ctlz,cttz and powi are special intrinsics whose second argument + // should be the same in order for them to be vectorized. + if (hasVectorInstrinsicScalarOpd(ID, 1)) { + Value *A1J = CI2->getArgOperand(1); + if (A1I != A1J) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI + << " argument "<< A1I<<"!=" << A1J + << "\n"); + return; + } + } + } + + newTreeEntry(VL, true); + for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) { + CallInst *CI2 = dyn_cast<CallInst>(VL[j]); + Operands.push_back(CI2->getArgOperand(i)); + } + buildTree_rec(Operands, Depth + 1); + } + return; + } + case Instruction::ShuffleVector: { + // If this is not an alternate sequence of opcodes like add-sub + // then do not vectorize this instruction. + if (!isAltShuffle) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: ShuffleVector instructions are not vectorized.\n"); + return; + } + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector.
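Because getIntrinsicIDForCall consults TargetLibraryInfo, recognized libm calls can be bundled like first-class intrinsics, while ctlz/cttz/powi additionally require the scalar second argument to match across lanes. A sketch of both shapes, assuming a compiler that provides __builtin_powif (Clang and GCC do) and a target where the vector forms are profitable:

    #include <cmath>

    // With TLI available, four sqrtf-lowered calls can become one
    // <4 x float> llvm.sqrt call.
    static void root4(float *a, const float *b) {
      a[0] = std::sqrt(b[0]);
      a[1] = std::sqrt(b[1]);
      a[2] = std::sqrt(b[2]);
      a[3] = std::sqrt(b[3]);
    }

    // powi keeps its exponent scalar, so every lane must share it; mixing
    // exponents would trip the A1I != A1J check above.
    static void pow3(float *a, const float *b) {
      a[0] = __builtin_powif(b[0], 3);
      a[1] = __builtin_powif(b[1], 3);
      a[2] = __builtin_powif(b[2], 3);
      a[3] = __builtin_powif(b[3], 3);
    }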
+ for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); + + buildTree_rec(Operands, Depth + 1); + } + return; + } default: newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); @@ -969,18 +1154,25 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } return getGatherCost(E->Scalars); } - - assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) && - "Invalid VL"); + unsigned Opcode = getSameOpcode(VL); + assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL"); Instruction *VL0 = cast<Instruction>(VL[0]); - unsigned Opcode = VL0->getOpcode(); switch (Opcode) { case Instruction::PHI: { return 0; } case Instruction::ExtractElement: { - if (CanReuseExtract(VL)) - return 0; + if (CanReuseExtract(VL)) { + int DeadCost = 0; + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + ExtractElementInst *E = cast<ExtractElementInst>(VL[i]); + if (E->hasOneUse()) + // Take credit for instruction that will become dead. + DeadCost += + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); + } + return -DeadCost; + } return getGatherCost(VecTy); } case Instruction::ZExt: @@ -1043,12 +1235,26 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_UniformConstantValue; - // Check whether all second operands are constant. - for (unsigned i = 0; i < VL.size(); ++i) - if (!isa<ConstantInt>(cast<Instruction>(VL[i])->getOperand(1))) { + // If all operands are exactly the same ConstantInt then set the + // operand kind to OK_UniformConstantValue. + // If instead not all operands are constants, then set the operand kind + // to OK_AnyValue. If all operands are constants but not the same, + // then set the operand kind to OK_NonUniformConstantValue. + ConstantInt *CInt = nullptr; + for (unsigned i = 0; i < VL.size(); ++i) { + const Instruction *I = cast<Instruction>(VL[i]); + if (!isa<ConstantInt>(I->getOperand(1))) { Op2VK = TargetTransformInfo::OK_AnyValue; break; } + if (i == 0) { + CInt = cast<ConstantInt>(I->getOperand(1)); + continue; + } + if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && + CInt != cast<ConstantInt>(I->getOperand(1))) + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; + } ScalarCost = VecTy->getNumElements() * @@ -1057,6 +1263,20 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } return VecCost - ScalarCost; } + case Instruction::GetElementPtr: { + TargetTransformInfo::OperandValueKind Op1VK = + TargetTransformInfo::OK_AnyValue; + TargetTransformInfo::OperandValueKind Op2VK = + TargetTransformInfo::OK_UniformConstantValue; + + int ScalarCost = + VecTy->getNumElements() * + TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK); + int VecCost = + TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK); + + return VecCost - ScalarCost; + } case Instruction::Load: { // Cost of wide load - cost of scalar loads. int ScalarLdCost = VecTy->getNumElements() * @@ -1071,6 +1291,55 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0); return VecStCost - ScalarStCost; } + case Instruction::Call: { + CallInst *CI = cast<CallInst>(VL0); + Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); + + // Calculate the cost of the scalar and vector calls. 
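The refined classification distinguishes three operand kinds for a bundle's second operands, which matters on targets where immediate operands are cheaper. In source terms (hypothetical shifts, one candidate bundle per group of four stores):

    static void shifts(int *a, const int *b, int n) {
      // OK_UniformConstantValue: the same ConstantInt in every lane.
      a[0] = b[0] << 3; a[1] = b[1] << 3; a[2] = b[2] << 3; a[3] = b[3] << 3;
      // OK_NonUniformConstantValue: all constant, but not all equal.
      a[4] = b[4] << 1; a[5] = b[5] << 2; a[6] = b[6] << 3; a[7] = b[7] << 4;
      // OK_AnyValue: at least one lane's operand is not a ConstantInt.
      a[8] = b[8] << n; a[9] = b[9] << n; a[10] = b[10] << n; a[11] = b[11] << n;
    }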
+ SmallVector<Type*, 4> ScalarTys, VecTys; + for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) { + ScalarTys.push_back(CI->getArgOperand(op)->getType()); + VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(), + VecTy->getNumElements())); + } + + int ScalarCallCost = VecTy->getNumElements() * + TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys); + + int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys); + + DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost + << " (" << VecCallCost << "-" << ScalarCallCost << ")" + << " for " << *CI << "\n"); + + return VecCallCost - ScalarCallCost; + } + case Instruction::ShuffleVector: { + TargetTransformInfo::OperandValueKind Op1VK = + TargetTransformInfo::OK_AnyValue; + TargetTransformInfo::OperandValueKind Op2VK = + TargetTransformInfo::OK_AnyValue; + int ScalarCost = 0; + int VecCost = 0; + for (unsigned i = 0; i < VL.size(); ++i) { + Instruction *I = cast<Instruction>(VL[i]); + if (!I) + break; + ScalarCost += + TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK); + } + // VecCost is equal to the sum of the cost of creating 2 vectors + // and the cost of creating the shuffle. + Instruction *I0 = cast<Instruction>(VL[0]); + VecCost = + TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK); + Instruction *I1 = cast<Instruction>(VL[1]); + VecCost += + TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK); + VecCost += + TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0); + return VecCost - ScalarCost; + } default: llvm_unreachable("Unknown instruction"); } @@ -1084,11 +1353,15 @@ bool BoUpSLP::isFullyVectorizableTinyTree() { if (VectorizableTree.size() != 2) return false; + // Handle splat stores. + if (!VectorizableTree[0].NeedToGather && isSplat(VectorizableTree[1].Scalars)) + return true; + // Gathering cost would be too much for tiny trees. - if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) - return false; + if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) + return false; - return true; + return true; } int BoUpSLP::getTreeCost() { @@ -1113,16 +1386,19 @@ int BoUpSLP::getTreeCost() { Cost += C; } + SmallSet<Value *, 16> ExtractCostCalculated; int ExtractCost = 0; for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end(); I != E; ++I) { + // We only add extract cost once for the same scalar.
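With the alternate-shuffle costing above, profitability is plain arithmetic: two vector binops plus one SK_Alternate shuffle against a scalar binop per lane. A worked example with made-up unit costs (real numbers come from TargetTransformInfo and vary by target):

    #include <cstdio>

    int main() {
      // Hypothetical unit costs for a 4-lane add/sub bundle.
      int ScalarBinOp = 1, VecBinOp = 1, AltShuffle = 1, Lanes = 4;
      int ScalarCost = Lanes * ScalarBinOp;    // four scalar binops
      int VecCost = 2 * VecBinOp + AltShuffle; // vadd + vsub + shuffle
      std::printf("entry cost = %d\n", VecCost - ScalarCost); // -1: profitable
      return 0;
    }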
+ if (!ExtractCostCalculated.insert(I->Scalar)) + continue; VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth); ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I->Lane); } - DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n"); return Cost + ExtractCost; } @@ -1157,7 +1433,7 @@ Value *BoUpSLP::getPointerOperand(Value *I) { return LI->getPointerOperand(); if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand(); - return 0; + return nullptr; } unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { @@ -1231,13 +1507,13 @@ Value *BoUpSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) { if (!A.Ptr || !B.Ptr || AA->alias(A, B)) return I; } - return 0; + return nullptr; } int BoUpSLP::getLastIndex(ArrayRef<Value *> VL) { BasicBlock *BB = cast<Instruction>(VL[0])->getParent(); - assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block"); - BlockNumbering &BN = BlocksNumbers[BB]; + assert(BB == getSameBlock(VL) && "Invalid block"); + BlockNumbering &BN = getBlockNumbering(BB); int MaxIdx = BN.getIndex(BB->getFirstNonPHI()); for (unsigned i = 0, e = VL.size(); i < e; ++i) @@ -1247,8 +1523,8 @@ int BoUpSLP::getLastIndex(ArrayRef<Value *> VL) { Instruction *BoUpSLP::getLastInstruction(ArrayRef<Value *> VL) { BasicBlock *BB = cast<Instruction>(VL[0])->getParent(); - assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block"); - BlockNumbering &BN = BlocksNumbers[BB]; + assert(BB == getSameBlock(VL) && "Invalid block"); + BlockNumbering &BN = getBlockNumbering(BB); int MaxIdx = BN.getIndex(cast<Instruction>(VL[0])); for (unsigned i = 1, e = VL.size(); i < e; ++i) @@ -1307,7 +1583,7 @@ Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const { if (En->isSame(VL) && En->VectorizedValue) return En->VectorizedValue; } - return 0; + return nullptr; } Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { @@ -1344,9 +1620,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { setInsertPointAfterBundle(E->Scalars); return Gather(E->Scalars, VecTy); } - - unsigned Opcode = VL0->getOpcode(); - assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode"); + unsigned Opcode = getSameOpcode(E->Scalars); switch (Opcode) { case Instruction::PHI: { @@ -1528,6 +1802,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { VecTy->getPointerTo(AS)); unsigned Alignment = LI->getAlignment(); LI = Builder.CreateLoad(VecPtr); + if (!Alignment) + Alignment = DL->getABITypeAlignment(LI->getPointerOperand()->getType()); LI->setAlignment(Alignment); E->VectorizedValue = LI; return propagateMetadata(LI, E->Scalars); @@ -1547,14 +1823,123 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo(AS)); StoreInst *S = Builder.CreateStore(VecValue, VecPtr); + if (!Alignment) + Alignment = DL->getABITypeAlignment(SI->getPointerOperand()->getType()); S->setAlignment(Alignment); E->VectorizedValue = S; return propagateMetadata(S, E->Scalars); } + case Instruction::GetElementPtr: { + setInsertPointAfterBundle(E->Scalars); + + ValueList Op0VL; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) + Op0VL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(0)); + + Value *Op0 = vectorizeTree(Op0VL); + + std::vector<Value *> OpVecs; + for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; + ++j) { + ValueList OpVL; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) + 
OpVL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(j)); + + Value *OpVec = vectorizeTree(OpVL); + OpVecs.push_back(OpVec); + } + + Value *V = Builder.CreateGEP(Op0, OpVecs); + E->VectorizedValue = V; + + if (Instruction *I = dyn_cast<Instruction>(V)) + return propagateMetadata(I, E->Scalars); + + return V; + } + case Instruction::Call: { + CallInst *CI = cast<CallInst>(VL0); + setInsertPointAfterBundle(E->Scalars); + Function *FI; + Intrinsic::ID IID = Intrinsic::not_intrinsic; + if (CI && (FI = CI->getCalledFunction())) { + IID = (Intrinsic::ID) FI->getIntrinsicID(); + } + std::vector<Value *> OpVecs; + for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { + ValueList OpVL; + // ctlz,cttz and powi are special intrinsics whose second argument is + // a scalar. This argument should not be vectorized. + if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) { + CallInst *CEI = cast<CallInst>(E->Scalars[0]); + OpVecs.push_back(CEI->getArgOperand(j)); + continue; + } + for (int i = 0, e = E->Scalars.size(); i < e; ++i) { + CallInst *CEI = cast<CallInst>(E->Scalars[i]); + OpVL.push_back(CEI->getArgOperand(j)); + } + + Value *OpVec = vectorizeTree(OpVL); + DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); + OpVecs.push_back(OpVec); + } + + Module *M = F->getParent(); + Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); + Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) }; + Function *CF = Intrinsic::getDeclaration(M, ID, Tys); + Value *V = Builder.CreateCall(CF, OpVecs); + E->VectorizedValue = V; + return V; + } + case Instruction::ShuffleVector: { + ValueList LHSVL, RHSVL; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) { + LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); + RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); + } + setInsertPointAfterBundle(E->Scalars); + + Value *LHS = vectorizeTree(LHSVL); + Value *RHS = vectorizeTree(RHSVL); + + if (Value *V = alreadyVectorized(E->Scalars)) + return V; + + // Create a vector of LHS op1 RHS + BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0); + Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS); + + // Create a vector of LHS op2 RHS + Instruction *VL1 = cast<Instruction>(E->Scalars[1]); + BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1); + Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS); + + // Create appropriate shuffle to take alternative operations from + // the vector. + std::vector<Constant *> Mask(E->Scalars.size()); + unsigned e = E->Scalars.size(); + for (unsigned i = 0; i < e; ++i) { + if (i & 1) + Mask[i] = Builder.getInt32(e + i); + else + Mask[i] = Builder.getInt32(i); + } + + Value *ShuffleMask = ConstantVector::get(Mask); + + Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); + E->VectorizedValue = V; + if (Instruction *I = dyn_cast<Instruction>(V)) + return propagateMetadata(I, E->Scalars); + + return V; + } default: llvm_unreachable("unknown inst"); } - return 0; + return nullptr; } Value *BoUpSLP::vectorizeTree() { @@ -1571,8 +1956,8 @@ Value *BoUpSLP::vectorizeTree() { // Skip users that we already RAUW. This happens when one instruction // has multiple uses of the same value. 
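The mask built in the ShuffleVector case takes even lanes from the first binop's result (V0) and odd lanes from the second's (V1). The same index computation as a standalone sketch:

    #include <cstdio>
    #include <vector>

    int main() {
      unsigned e = 4; // bundle width
      std::vector<unsigned> Mask(e);
      for (unsigned i = 0; i < e; ++i)
        Mask[i] = (i & 1) ? e + i : i; // odd lanes index into V1 (e..2e-1)
      for (unsigned m : Mask)
        std::printf("%u ", m); // 0 5 2 7
      std::printf("\n");
      return 0;
    }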
- if (std::find(Scalar->use_begin(), Scalar->use_end(), User) == - Scalar->use_end()) + if (std::find(Scalar->user_begin(), Scalar->user_end(), User) == + Scalar->user_end()) continue; assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar"); @@ -1586,12 +1971,7 @@ Value *BoUpSLP::vectorizeTree() { Value *Lane = Builder.getInt32(it->Lane); // Generate extracts for out-of-tree users. // Find the insertion point for the extractelement lane. - if (PHINode *PN = dyn_cast<PHINode>(Vec)) { - Builder.SetInsertPoint(PN->getParent()->getFirstInsertionPt()); - Value *Ex = Builder.CreateExtractElement(Vec, Lane); - CSEBlocks.insert(PN->getParent()); - User->replaceUsesOfWith(Scalar, Ex); - } else if (isa<Instruction>(Vec)){ + if (isa<Instruction>(Vec)){ if (PHINode *PH = dyn_cast<PHINode>(User)) { for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) { if (PH->getIncomingValue(i) == Scalar) { @@ -1624,7 +2004,6 @@ Value *BoUpSLP::vectorizeTree() { // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; - // No need to handle users of gathered values. if (Entry->NeedToGather) continue; @@ -1633,15 +2012,17 @@ Value *BoUpSLP::vectorizeTree() { Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { - for (Value::use_iterator User = Scalar->use_begin(), - UE = Scalar->use_end(); User != UE; ++User) { - DEBUG(dbgs() << "SLP: \tvalidating user:" << **User << ".\n"); - - assert((ScalarToTreeEntry.count(*User) || - // It is legal to replace the reduction users by undef. - (RdxOps && RdxOps->count(*User))) && +#ifndef NDEBUG + for (User *U : Scalar->users()) { + DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); + + assert((ScalarToTreeEntry.count(U) || + // It is legal to replace users in the ignorelist by undef. + (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) != + UserIgnoreList.end())) && "Replacing out-of-tree value with undef"); } +#endif Value *Undef = UndefValue::get(Ty); Scalar->replaceAllUsesWith(Undef); } @@ -1650,24 +2031,14 @@ Value *BoUpSLP::vectorizeTree() { } } - for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) { - BlocksNumbers[it].forget(); - } + for (auto &BN : BlocksNumbers) + BN.second.forget(); + Builder.ClearInsertionPoint(); return VectorizableTree[0].VectorizedValue; } -class DTCmp { - const DominatorTree *DT; - -public: - DTCmp(const DominatorTree *DT) : DT(DT) {} - bool operator()(const BasicBlock *A, const BasicBlock *B) const { - return DT->properlyDominates(A, B); - } -}; - void BoUpSLP::optimizeGatherSequence() { DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() << " gather sequence instructions.\n"); @@ -1703,21 +2074,30 @@ void BoUpSLP::optimizeGatherSequence() { Insert->moveBefore(PreHeader->getTerminator()); } + // Make a list of all reachable blocks in our CSE queue. + SmallVector<const DomTreeNode *, 8> CSEWorkList; + CSEWorkList.reserve(CSEBlocks.size()); + for (BasicBlock *BB : CSEBlocks) + if (DomTreeNode *N = DT->getNode(BB)) { + assert(DT->isReachableFromEntry(N)); + CSEWorkList.push_back(N); + } + // Sort blocks by domination. This ensures we visit a block after all blocks // dominating it are visited.
- SmallVector<BasicBlock *, 8> CSEWorkList(CSEBlocks.begin(), CSEBlocks.end()); - std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), DTCmp(DT)); + std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), + [this](const DomTreeNode *A, const DomTreeNode *B) { + return DT->properlyDominates(A, B); + }); // Perform O(N^2) search over the gather sequences and merge identical // instructions. TODO: We can further optimize this scan if we split the // instructions into different buckets based on the insert lane. SmallVector<Instruction *, 16> Visited; - for (SmallVectorImpl<BasicBlock *>::iterator I = CSEWorkList.begin(), - E = CSEWorkList.end(); - I != E; ++I) { - assert((I == CSEWorkList.begin() || !DT->dominates(*I, *llvm::prior(I))) && + for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { + assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && "Worklist not sorted properly!"); - BasicBlock *BB = *I; + BasicBlock *BB = (*I)->getBlock(); // For all instructions in blocks containing gather sequences: for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { Instruction *In = it++; @@ -1733,7 +2113,7 @@ void BoUpSLP::optimizeGatherSequence() { DT->dominates((*v)->getParent(), In->getParent())) { In->replaceAllUsesWith(*v); In->eraseFromParent(); - In = 0; + In = nullptr; break; } } @@ -1760,19 +2140,25 @@ struct SLPVectorizer : public FunctionPass { } ScalarEvolution *SE; - DataLayout *DL; + const DataLayout *DL; TargetTransformInfo *TTI; + TargetLibraryInfo *TLI; AliasAnalysis *AA; LoopInfo *LI; DominatorTree *DT; - virtual bool runOnFunction(Function &F) { + bool runOnFunction(Function &F) override { + if (skipOptnoneFunction(F)) + return false; + SE = &getAnalysis<ScalarEvolution>(); - DL = getAnalysisIfAvailable<DataLayout>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; TTI = &getAnalysis<TargetTransformInfo>(); + TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); AA = &getAnalysis<AliasAnalysis>(); LI = &getAnalysis<LoopInfo>(); - DT = &getAnalysis<DominatorTree>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); StoreRefs.clear(); bool Changed = false; @@ -1793,15 +2179,14 @@ struct SLPVectorizer : public FunctionPass { DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); - // Use the bollom up slp vectorizer to construct chains that start with - // he store instructions. - BoUpSLP R(&F, SE, DL, TTI, AA, LI, DT); + // Use the bottom up slp vectorizer to construct chains that start with + // store instructions. + BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT); // Scan the blocks in the function in post order. for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()), e = po_end(&F.getEntryBlock()); it != e; ++it) { BasicBlock *BB = *it; - // Vectorize trees that end at stores. 
if (unsigned count = collectStores(BB, R)) { (void)count; @@ -1821,15 +2206,15 @@ struct SLPVectorizer : public FunctionPass { return Changed; } - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { FunctionPass::getAnalysisUsage(AU); AU.addRequired<ScalarEvolution>(); AU.addRequired<AliasAnalysis>(); AU.addRequired<TargetTransformInfo>(); AU.addRequired<LoopInfo>(); - AU.addRequired<DominatorTree>(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<LoopInfo>(); - AU.addPreserved<DominatorTree>(); + AU.addPreserved<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); } @@ -1845,8 +2230,11 @@ private: bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R); /// \brief Try to vectorize a list of operands. + /// \param BuildVector A list of users to ignore for the purpose of + /// scheduling and that don't need extracting. /// \returns true if a value was vectorized. - bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R); + bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, + ArrayRef<Value *> BuildVector = None); /// \brief Try to vectorize a chain that may start at the operands of \p V. bool tryToVectorize(BinaryOperator *V, BoUpSLP &R); @@ -1867,7 +2255,7 @@ private: StoreListMap StoreRefs; }; -/// \brief Check that the Values in the slice in VL array are still existant in +/// \brief Check that the Values in the slice in VL array are still existent in /// the WeakVH array. /// Vectorization of part of the VL array may cause later values in the VL array /// to become invalid. We track when this has happened in the WeakVH array. @@ -1894,7 +2282,7 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain, if (!isPowerOf2_32(Sz) || VF < 2) return false; - // Keep track of values that were delete by vectorizing in the loop below. + // Keep track of values that were deleted by vectorizing in the loop below. SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end()); bool Changed = false; @@ -2000,7 +2388,7 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { // Check that the pointer points to scalars. Type *Ty = SI->getValueOperand()->getType(); if (Ty->isAggregateType() || Ty->isVectorTy()) - return 0; + continue; // Find the base pointer. Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL); @@ -2019,7 +2407,8 @@ bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { return tryToVectorizeList(VL, R); } -bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) { +bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, + ArrayRef<Value *> BuildVector) { if (VL.size() < 2) return false; @@ -2047,7 +2436,7 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) { bool Changed = false; - // Keep track of values that were delete by vectorizing in the loop below. + // Keep track of values that were deleted by vectorizing in the loop below.
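collectStores and vectorizeStoreChain seed the whole pass: chains of consecutive stores fed by isomorphic scalar expression trees. A hypothetical seed pattern (buildTree grows upward from the stores through the multiplies to the loads):

    static void seed(int *a, const int *b, const int *c) {
      a[0] = b[0] * c[0];
      a[1] = b[1] * c[1];
      a[2] = b[2] * c[2];
      a[3] = b[3] * c[3];
    }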
SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end()); for (unsigned i = 0, e = VL.size(); i < e; ++i) { @@ -2069,13 +2458,38 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) { << "\n"); ArrayRef<Value *> Ops = VL.slice(i, OpsWidth); - R.buildTree(Ops); + ArrayRef<Value *> BuildVectorSlice; + if (!BuildVector.empty()) + BuildVectorSlice = BuildVector.slice(i, OpsWidth); + + R.buildTree(Ops, BuildVectorSlice); int Cost = R.getTreeCost(); if (Cost < -SLPCostThreshold) { - DEBUG(dbgs() << "SLP: Vectorizing pair at cost:" << Cost << ".\n"); - R.vectorizeTree(); - + DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); + Value *VectorizedRoot = R.vectorizeTree(); + + // Reconstruct the build vector by extracting from the vectorized root. This + // way we handle the case where some elements of the vector are undefined. + // (return (insertelt <4 x i32> (insertelt undef (opd0) 0) (opd1) 2)) + if (!BuildVectorSlice.empty()) { + // The insert point is the last build vector instruction. The vectorized + // root will precede it. This guarantees that we get an instruction. The + // vectorized tree could have been constant folded. + Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back()); + unsigned VecIdx = 0; + for (auto &V : BuildVectorSlice) { + IRBuilder<true, NoFolder> Builder( + ++BasicBlock::iterator(InsertAfter)); + InsertElementInst *IE = cast<InsertElementInst>(V); + Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement( + VectorizedRoot, Builder.getInt32(VecIdx++))); + IE->setOperand(1, Extract); + IE->removeFromParent(); + IE->insertAfter(Extract); + InsertAfter = IE; + } + } // Move to the next bundle. i += VF - 1; Changed = true; @@ -2184,7 +2598,7 @@ static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx, /// *p = /// class HorizontalReduction { - SmallPtrSet<Value *, 16> ReductionOps; + SmallVector<Value *, 16> ReductionOps; SmallVector<Value *, 32> ReducedVals; BinaryOperator *ReductionRoot; @@ -2202,12 +2616,12 @@ class HorizontalReduction { public: HorizontalReduction() - : ReductionRoot(0), ReductionPHI(0), ReductionOpcode(0), + : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0), ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {} /// \brief Try to find a reduction tree. bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B, - DataLayout *DL) { + const DataLayout *DL) { assert((!Phi || std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) && "This phi needs to use the binary operator"); @@ -2217,10 +2631,10 @@ public: // In such a case start looking for a tree rooted in the first '+'. if (Phi) { if (B->getOperand(0) == Phi) { - Phi = 0; + Phi = nullptr; B = dyn_cast<BinaryOperator>(B->getOperand(1)); } else if (B->getOperand(1) == Phi) { - Phi = 0; + Phi = nullptr; B = dyn_cast<BinaryOperator>(B->getOperand(0)); } } @@ -2278,7 +2692,7 @@ public: // We need to be able to reassociate the adds. if (!TreeN->isAssociative()) return false; - ReductionOps.insert(TreeN); + ReductionOps.push_back(TreeN); } // Retract.
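matchAssociativeReduction walks a flattened chain of associative binops, collecting the operators into ReductionOps (now a SmallVector so it can double as buildTree's user-ignore list) and the leaves into ReducedVals. A sketch of the expression shape it matches; the loop-PHI form is the other accepted shape:

    // A 4-wide horizontal add: the adds become ReductionOps, the loads
    // become ReducedVals (hypothetical example).
    static int hadd4(const int *a) {
      return (a[0] + a[1]) + (a[2] + a[3]);
    }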
Stack.pop_back(); @@ -2306,7 +2720,7 @@ public: if (NumReducedVals < ReduxWidth) return false; - Value *VectorizedTree = 0; + Value *VectorizedTree = nullptr; IRBuilder<> Builder(ReductionRoot); FastMathFlags Unsafe; Unsafe.setUnsafeAlgebra(); @@ -2315,7 +2729,7 @@ public: for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) { ArrayRef<Value *> ValsToReduce(&ReducedVals[i], ReduxWidth); - V.buildTree(ValsToReduce, &ReductionOps); + V.buildTree(ValsToReduce, ReductionOps); // Estimate cost. int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]); @@ -2349,13 +2763,13 @@ public: } // Update users. if (ReductionPHI) { - assert(ReductionRoot != NULL && "Need a reduction operation"); + assert(ReductionRoot && "Need a reduction operation"); ReductionRoot->setOperand(0, VectorizedTree); ReductionRoot->setOperand(1, ReductionPHI); } else ReductionRoot->replaceAllUsesWith(VectorizedTree); } - return VectorizedTree != 0; + return VectorizedTree != nullptr; } private: @@ -2434,18 +2848,21 @@ private: /// /// Returns true if it matches /// -static bool findBuildVector(InsertElementInst *IE, - SmallVectorImpl<Value *> &Ops) { - if (!isa<UndefValue>(IE->getOperand(0))) +static bool findBuildVector(InsertElementInst *FirstInsertElem, + SmallVectorImpl<Value *> &BuildVector, + SmallVectorImpl<Value *> &BuildVectorOpds) { + if (!isa<UndefValue>(FirstInsertElem->getOperand(0))) return false; + InsertElementInst *IE = FirstInsertElem; while (true) { - Ops.push_back(IE->getOperand(1)); + BuildVector.push_back(IE); + BuildVectorOpds.push_back(IE->getOperand(1)); if (IE->use_empty()) return false; - InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->use_back()); + InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back()); if (!NextUse) return true; @@ -2512,7 +2929,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { break; } - // Start over at the next instruction of a differnt type (or the end). + // Start over at the next instruction of a different type (or the end). IncIt = SameTypeIt; } } @@ -2535,7 +2952,8 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { Value *Rdx = (P->getIncomingBlock(0) == BB ? (P->getIncomingValue(0)) - : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) : 0)); + : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) + : nullptr)); // Check if this is a Binary Operator. BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx); if (!BI) @@ -2574,7 +2992,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(SI->getValueOperand())) { HorizontalReduction HorRdx; - if (((HorRdx.matchAssociativeReduction(0, BinOp, DL) && + if (((HorRdx.matchAssociativeReduction(nullptr, BinOp, DL) && HorRdx.tryToReduce(R, TTI)) || tryToVectorize(BinOp, R))) { Changed = true; @@ -2610,12 +3028,16 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { } // Try to vectorize trees that start at insertelement instructions. 
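findBuildVector now returns both the insertelement chain itself (BuildVector) and the inserted scalars (BuildVectorOpds), so the chain can be ignored during scheduling and rebuilt from extracts afterwards. A source-level pattern that lowers to such a chain, using the Clang/GCC vector extension:

    typedef float v4sf __attribute__((vector_size(16))); // Clang/GCC extension

    // Builds a vector lane by lane; in IR this becomes a chain of
    // insertelement instructions rooted at undef, which findBuildVector
    // matches.
    static v4sf build(float a, float b, float c, float d) {
      v4sf v = {a, b, c, d};
      return v;
    }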
- if (InsertElementInst *IE = dyn_cast<InsertElementInst>(it)) { - SmallVector<Value *, 8> Ops; - if (!findBuildVector(IE, Ops)) + if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) { + SmallVector<Value *, 16> BuildVector; + SmallVector<Value *, 16> BuildVectorOpds; + if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds)) continue; - if (tryToVectorizeList(Ops, R)) { + // Vectorize starting with the build vector operands ignoring the + // BuildVector instructions for the purpose of scheduling and user + // extraction. + if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) { Changed = true; it = BB->begin(); e = BB->end(); diff --git a/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp index a927fe1..d459bcf 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp @@ -17,7 +17,7 @@ #include "llvm-c/Initialization.h" #include "llvm-c/Transforms/Vectorize.h" #include "llvm/Analysis/Passes.h" -#include "llvm/Analysis/Verifier.h" +#include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" #include "llvm/PassManager.h"