Diffstat (limited to 'contrib/llvm/lib/Transforms')
109 files changed, 17921 insertions, 9380 deletions
diff --git a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
index e6fa4ed..df08091 100644
--- a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -88,7 +88,7 @@ char ArgPromotion::ID = 0;
 INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion",
                 "Promote 'by reference' arguments to scalars", false, false)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
-INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(CallGraph)
 INITIALIZE_PASS_END(ArgPromotion, "argpromotion",
                 "Promote 'by reference' arguments to scalars", false, false)
@@ -126,12 +126,10 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
   if (!F || !F->hasLocalLinkage()) return 0;
 
   // First check: see if there are any pointer arguments!  If not, quick exit.
-  SmallVector<std::pair<Argument*, unsigned>, 16> PointerArgs;
-  unsigned ArgNo = 0;
-  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
-       I != E; ++I, ++ArgNo)
+  SmallVector<Argument*, 16> PointerArgs;
+  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I)
     if (I->getType()->isPointerTy())
-      PointerArgs.push_back(std::pair<Argument*, unsigned>(I, ArgNo));
+      PointerArgs.push_back(I);
   if (PointerArgs.empty()) return 0;
 
   // Second check: make sure that all callers are direct callers.  We can't
@@ -152,15 +150,13 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
   // add it to ArgsToPromote.
   SmallPtrSet<Argument*, 8> ArgsToPromote;
   SmallPtrSet<Argument*, 8> ByValArgsToTransform;
-  for (unsigned i = 0; i != PointerArgs.size(); ++i) {
-    bool isByVal=F->getAttributes().
-      hasAttribute(PointerArgs[i].second+1, Attribute::ByVal);
-    Argument *PtrArg = PointerArgs[i].first;
+  for (unsigned i = 0, e = PointerArgs.size(); i != e; ++i) {
+    Argument *PtrArg = PointerArgs[i];
     Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
 
     // If this is a byval argument, and if the aggregate type is small, just
     // pass the elements, which is always safe.
-    if (isByVal) {
+    if (PtrArg->hasByValAttr()) {
       if (StructType *STy = dyn_cast<StructType>(AgTy)) {
         if (maxElements > 0 && STy->getNumElements() > maxElements) {
           DEBUG(dbgs() << "argpromotion disable promoting argument '"
@@ -205,7 +201,7 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
     }
 
     // Otherwise, see if we can promote the pointer to its value.
-    if (isSafeToPromoteArgument(PtrArg, isByVal))
+    if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValAttr()))
       ArgsToPromote.insert(PtrArg);
   }
 
@@ -221,8 +217,7 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
 static bool AllCallersPassInValidPointerForArgument(Argument *Arg) {
   Function *Callee = Arg->getParent();
 
-  unsigned ArgNo = std::distance(Callee->arg_begin(),
-                                 Function::arg_iterator(Arg));
+  unsigned ArgNo = Arg->getArgNo();
 
   // Look at all call sites of the function.  At this pointer we know we only
   // have direct callees.
@@ -509,7 +504,9 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
   // OriginalLoads - Keep track of a representative load instruction from the
   // original function so that we can tell the alias analysis implementation
   // what the new GEP/Load instructions we are inserting look like.
-  std::map<IndicesVector, LoadInst*> OriginalLoads;
+  // We need to keep the original loads for each argument and the elements
+  // of the argument that are accessed.
+  std::map<std::pair<Argument*, IndicesVector>, LoadInst*> OriginalLoads;
 
   // Attribute - Keep track of the parameter attributes for the arguments
   // that we are *not* promoting. For the ones that we do promote, the parameter
@@ -574,7 +571,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
       else
         // Take any load, we will use it only to update Alias Analysis
        OrigLoad = cast<LoadInst>(User->use_back());
-      OriginalLoads[Indices] = OrigLoad;
+      OriginalLoads[std::make_pair(I, Indices)] = OrigLoad;
     }
 
   // Add a parameter to the function for each element passed in.
@@ -681,7 +678,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
       for (ScalarizeTable::iterator SI = ArgIndices.begin(),
              E = ArgIndices.end(); SI != E; ++SI) {
         Value *V = *AI;
-        LoadInst *OrigLoad = OriginalLoads[*SI];
+        LoadInst *OrigLoad = OriginalLoads[std::make_pair(I, *SI)];
         if (!SI->empty()) {
           Ops.reserve(SI->size());
           Type *ElTy = V->getType();
diff --git a/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp
index a7bf188..d94c0f4 100644
--- a/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp
@@ -93,9 +93,12 @@ bool ConstantMerge::hasKnownAlignment(GlobalVariable *GV) const {
 }
 
 unsigned ConstantMerge::getAlignment(GlobalVariable *GV) const {
+  unsigned Align = GV->getAlignment();
+  if (Align)
+    return Align;
   if (TD)
     return TD->getPreferredAlignment(GV);
-  return GV->getAlignment();
+  return 0;
 }
 
 bool ConstantMerge::runOnModule(Module &M) {
@@ -210,9 +213,9 @@ bool ConstantMerge::runOnModule(Module &M) {
       // Bump the alignment if necessary.
       if (Replacements[i].first->getAlignment() ||
           Replacements[i].second->getAlignment()) {
-        Replacements[i].second->setAlignment(std::max(
-            Replacements[i].first->getAlignment(),
-            Replacements[i].second->getAlignment()));
+        Replacements[i].second->setAlignment(
+            std::max(getAlignment(Replacements[i].first),
+                     getAlignment(Replacements[i].second)));
       }
 
       // Eliminate any uses of the dead global.
diff --git a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 49ef1e7..911c14e 100644
--- a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -211,7 +211,9 @@ void DAE::CollectFunctionDIs(Module &M) {
       for (unsigned SPIndex = 0, SPNum = SPs.getNumElements();
            SPIndex < SPNum; ++SPIndex) {
         DISubprogram SP(SPs.getElement(SPIndex));
-        if (!SP.Verify())
+        assert((!SP || SP.isSubprogram()) &&
+               "A MDNode in subprograms of a CU should be null or a DISubprogram.");
+        if (!SP)
           continue;
         if (Function *F = SP.getFunction())
           FunctionDIs[F] = SP;
@@ -263,8 +265,10 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {
   // to pass in a smaller number of arguments into the new function.
   //
   std::vector<Value*> Args;
-  while (!Fn.use_empty()) {
-    CallSite CS(Fn.use_back());
+  for (Value::use_iterator I = Fn.use_begin(), E = Fn.use_end(); I != E; ) {
+    CallSite CS(*I++);
+    if (!CS)
+      continue;
     Instruction *Call = CS.getInstruction();
 
     // Pass all the same arguments.
@@ -330,6 +334,11 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {
   if (DI != FunctionDIs.end())
     DI->second.replaceFunction(NF);
 
+  // Fix up any BlockAddresses that refer to the function.
+  Fn.replaceAllUsesWith(ConstantExpr::getBitCast(NF, Fn.getType()));
+  // Delete the bitcast that we just created, so that NF does not
+  // appear to be address-taken.
+  NF->removeDeadConstantUsers();
 
   // Finally, nuke the old function.
   Fn.eraseFromParent();
   return true;
@@ -343,8 +352,22 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn)
   if (Fn.isDeclaration() || Fn.mayBeOverridden())
     return false;
 
-  // Functions with local linkage should already have been handled.
-  if (Fn.hasLocalLinkage())
+  // Functions with local linkage should already have been handled, except the
+  // fragile (variadic) ones which we can improve here.
+  if (Fn.hasLocalLinkage() && !Fn.getFunctionType()->isVarArg())
+    return false;
+
+  // If a function seen at compile time is not necessarily the one linked to
+  // the binary being built, it is illegal to change the actual arguments
+  // passed to it. These functions can be captured by isWeakForLinker().
+  // *NOTE* that mayBeOverridden() is insufficient for this purpose as it
+  // doesn't include linkage types like AvailableExternallyLinkage and
+  // LinkOnceODRLinkage. Take link_odr* as an example, it indicates a set of
+  // *EQUIVALENT* globals that can be merged at link-time. However, the
+  // semantic of *EQUIVALENT*-functions includes parameters. Changing
+  // parameters breaks this assumption.
+  //
+  if (Fn.isWeakForLinker())
     return false;
 
   if (Fn.use_empty())
@@ -604,9 +627,20 @@ void DAE::SurveyFunction(const Function &F) {
   UseVector MaybeLiveArgUses;
   for (Function::const_arg_iterator AI = F.arg_begin(),
        E = F.arg_end(); AI != E; ++AI, ++i) {
-    // See what the effect of this use is (recording any uses that cause
-    // MaybeLive in MaybeLiveArgUses).
-    Liveness Result = SurveyUses(AI, MaybeLiveArgUses);
+    Liveness Result;
+    if (F.getFunctionType()->isVarArg()) {
+      // Variadic functions will already have a va_arg function expanded inside
+      // them, making them potentially very sensitive to ABI changes resulting
+      // from removing arguments entirely, so don't. For example AArch64 handles
+      // register and stack HFAs very differently, and this is reflected in the
+      // IR which has already been generated.
+      Result = Live;
+    } else {
+      // See what the effect of this use is (recording any uses that cause
+      // MaybeLive in MaybeLiveArgUses).
+      Result = SurveyUses(AI, MaybeLiveArgUses);
+    }
+
     // Mark the result.
     MarkValue(CreateArg(&F, i), Result, MaybeLiveArgUses);
     // Clear the vector again for the next iteration.
@@ -695,10 +729,42 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
   FunctionType *FTy = F->getFunctionType();
   std::vector<Type*> Params;
 
+  // Keep track of if we have a live 'returned' argument
+  bool HasLiveReturnedArg = false;
+
   // Set up to build a new list of parameter attributes.
   SmallVector<AttributeSet, 8> AttributesVec;
   const AttributeSet &PAL = F->getAttributes();
 
+  // Remember which arguments are still alive.
+  SmallVector<bool, 10> ArgAlive(FTy->getNumParams(), false);
+  // Construct the new parameter list from non-dead arguments. Also construct
+  // a new set of parameter attributes to correspond. Skip the first parameter
+  // attribute, since that belongs to the return value.
+  unsigned i = 0;
+  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; ++I, ++i) {
+    RetOrArg Arg = CreateArg(F, i);
+    if (LiveValues.erase(Arg)) {
+      Params.push_back(I->getType());
+      ArgAlive[i] = true;
+
+      // Get the original parameter attributes (skipping the first one, that is
+      // for the return value.
+      if (PAL.hasAttributes(i + 1)) {
+        AttrBuilder B(PAL, i + 1);
+        if (B.contains(Attribute::Returned))
+          HasLiveReturnedArg = true;
+        AttributesVec.
+          push_back(AttributeSet::get(F->getContext(), Params.size(), B));
+      }
+    } else {
+      ++NumArgumentsEliminated;
+      DEBUG(dbgs() << "DAE - Removing argument " << i << " (" << I->getName()
+            << ") from " << F->getName() << "\n");
+    }
+  }
+
   // Find out the new return value.
   Type *RetTy = FTy->getReturnType();
   Type *NRetTy = NULL;
@@ -707,7 +773,27 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
   // -1 means unused, other numbers are the new index
   SmallVector<int, 5> NewRetIdxs(RetCount, -1);
   std::vector<Type*> RetTypes;
-  if (RetTy->isVoidTy()) {
+
+  // If there is a function with a live 'returned' argument but a dead return
+  // value, then there are two possible actions:
+  // 1) Eliminate the return value and take off the 'returned' attribute on the
+  //    argument.
+  // 2) Retain the 'returned' attribute and treat the return value (but not the
+  //    entire function) as live so that it is not eliminated.
+  //
+  // It's not clear in the general case which option is more profitable because,
+  // even in the absence of explicit uses of the return value, code generation
+  // is free to use the 'returned' attribute to do things like eliding
+  // save/restores of registers across calls. Whether or not this happens is
+  // target and ABI-specific as well as depending on the amount of register
+  // pressure, so there's no good way for an IR-level pass to figure this out.
+  //
+  // Fortunately, the only places where 'returned' is currently generated by
+  // the FE are places where 'returned' is basically free and almost always a
+  // performance win, so the second option can just be used always for now.
+  //
+  // This should be revisited if 'returned' is ever applied more liberally.
+  if (RetTy->isVoidTy() || HasLiveReturnedArg) {
     NRetTy = RetTy;
   } else {
     StructType *STy = dyn_cast<StructType>(RetTy);
@@ -777,33 +863,6 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
   if (RAttrs.hasAttributes(AttributeSet::ReturnIndex))
     AttributesVec.push_back(AttributeSet::get(NRetTy->getContext(), RAttrs));
 
-  // Remember which arguments are still alive.
-  SmallVector<bool, 10> ArgAlive(FTy->getNumParams(), false);
-  // Construct the new parameter list from non-dead arguments. Also construct
-  // a new set of parameter attributes to correspond. Skip the first parameter
-  // attribute, since that belongs to the return value.
-  unsigned i = 0;
-  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
-       I != E; ++I, ++i) {
-    RetOrArg Arg = CreateArg(F, i);
-    if (LiveValues.erase(Arg)) {
-      Params.push_back(I->getType());
-      ArgAlive[i] = true;
-
-      // Get the original parameter attributes (skipping the first one, that is
-      // for the return value.
-      if (PAL.hasAttributes(i + 1)) {
-        AttrBuilder B(PAL, i + 1);
-        AttributesVec.
-          push_back(AttributeSet::get(F->getContext(), Params.size(), B));
-      }
-    } else {
-      ++NumArgumentsEliminated;
-      DEBUG(dbgs() << "DAE - Removing argument " << i << " (" << I->getName()
-            << ") from " << F->getName() << "\n");
-    }
-  }
-
   if (PAL.hasAttributes(AttributeSet::FunctionIndex))
     AttributesVec.push_back(AttributeSet::get(F->getContext(),
                                               PAL.getFnAttributes()));
@@ -864,6 +923,13 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
       // Get original parameter attributes, but skip return attributes.
       if (CallPAL.hasAttributes(i + 1)) {
         AttrBuilder B(CallPAL, i + 1);
+        // If the return type has changed, then get rid of 'returned' on the
+        // call site. The alternative is to make all 'returned' attributes on
+        // call sites keep the return value alive just like 'returned'
+        // attributes on function declaration but it's less clearly a win
+        // and this is not an expected case anyway
+        if (NRetTy != RetTy && B.contains(Attribute::Returned))
+          B.removeAttribute(Attribute::Returned);
         AttributesVec.
           push_back(AttributeSet::get(F->getContext(), Args.size(), B));
       }
diff --git a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp
index fa3d72d..50fb3e6 100644
--- a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp
@@ -21,6 +21,38 @@
 #include <algorithm>
 using namespace llvm;
 
+/// Make sure GV is visible from both modules. Delete is true if it is
+/// being deleted from this module.
+/// This also makes sure GV cannot be dropped so that references from
+/// the split module remain valid.
+static void makeVisible(GlobalValue &GV, bool Delete) {
+  bool Local = GV.hasLocalLinkage();
+  if (Local)
+    GV.setVisibility(GlobalValue::HiddenVisibility);
+
+  if (Local || Delete) {
+    GV.setLinkage(GlobalValue::ExternalLinkage);
+    return;
+  }
+
+  if (!GV.hasLinkOnceLinkage()) {
+    assert(!GV.isDiscardableIfUnused());
+    return;
+  }
+
+  // Map linkonce* to weak* so that llvm doesn't drop this GV.
+  switch(GV.getLinkage()) {
+  default:
+    llvm_unreachable("Unexpected linkage");
+  case GlobalValue::LinkOnceAnyLinkage:
+    GV.setLinkage(GlobalValue::WeakAnyLinkage);
+    return;
+  case GlobalValue::LinkOnceODRLinkage:
+    GV.setLinkage(GlobalValue::WeakODRLinkage);
+    return;
+  }
+}
+
 namespace {
   /// @brief A pass to extract specific functions and their dependencies.
   class GVExtractorPass : public ModulePass {
@@ -60,12 +92,7 @@ namespace {
         continue;
       }
 
-      bool Local = I->isDiscardableIfUnused();
-      if (Local)
-        I->setVisibility(GlobalValue::HiddenVisibility);
-
-      if (Local || Delete)
-        I->setLinkage(GlobalValue::ExternalLinkage);
+      makeVisible(*I, Delete);
 
       if (Delete)
         I->setInitializer(0);
@@ -80,12 +107,7 @@ namespace {
         continue;
       }
 
-      bool Local = I->isDiscardableIfUnused();
-      if (Local)
-        I->setVisibility(GlobalValue::HiddenVisibility);
-
-      if (Local || Delete)
-        I->setLinkage(GlobalValue::ExternalLinkage);
+      makeVisible(*I, Delete);
 
       if (Delete)
        I->deleteBody();
@@ -97,12 +119,10 @@ namespace {
       Module::alias_iterator CurI = I;
       ++I;
 
-      if (CurI->isDiscardableIfUnused()) {
-        CurI->setVisibility(GlobalValue::HiddenVisibility);
-        CurI->setLinkage(GlobalValue::ExternalLinkage);
-      }
+      bool Delete = deleteStuff == (bool)Named.count(CurI);
+      makeVisible(*CurI, Delete);
 
-      if (deleteStuff == (bool)Named.count(CurI)) {
+      if (Delete) {
         Type *Ty =  CurI->getType()->getElementType();
 
         CurI->removeFromParent();
diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index bc5109b..60e5f06 100644
--- a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -9,14 +9,12 @@
 //
 // This file implements a simple interprocedural pass which walks the
 // call-graph, looking for functions which do not access or only read
-// non-local memory, and marking them readnone/readonly. In addition,
-// it marks function arguments (of pointer type) 'nocapture' if a call
-// to the function does not create any copies of the pointer value that
-// outlive the call. This more or less means that the pointer is only
-// dereferenced, and not returned from the function or stored in a global.
-// Finally, well-known library call declarations are marked with all
-// attributes that are consistent with the function's standard definition.
-// This pass is implemented as a bottom-up traversal of the call-graph.
+// non-local memory, and marking them readnone/readonly. It does the
+// same with function arguments independently, marking them readonly/
+// readnone/nocapture. Finally, well-known library call declarations
+// are marked with all attributes that are consistent with the
+// function's standard definition. This pass is implemented as a
+// bottom-up traversal of the call-graph.
 //
 //===----------------------------------------------------------------------===//
@@ -40,6 +38,8 @@ using namespace llvm;
 STATISTIC(NumReadNone, "Number of functions marked readnone");
 STATISTIC(NumReadOnly, "Number of functions marked readonly");
 STATISTIC(NumNoCapture, "Number of arguments marked nocapture");
+STATISTIC(NumReadNoneArg, "Number of arguments marked readnone");
+STATISTIC(NumReadOnlyArg, "Number of arguments marked readonly");
 STATISTIC(NumNoAlias, "Number of function returns marked noalias");
 STATISTIC(NumAnnotated, "Number of attributes added to library functions");
@@ -56,8 +56,8 @@ namespace {
     // AddReadAttrs - Deduce readonly/readnone attributes for the SCC.
     bool AddReadAttrs(const CallGraphSCC &SCC);
 
-    // AddNoCaptureAttrs - Deduce nocapture attributes for the SCC.
-    bool AddNoCaptureAttrs(const CallGraphSCC &SCC);
+    // AddArgumentAttrs - Deduce nocapture attributes for the SCC.
+    bool AddArgumentAttrs(const CallGraphSCC &SCC);
 
     // IsFunctionMallocLike - Does this function allocate new memory?
     bool IsFunctionMallocLike(Function *F,
@@ -71,36 +71,43 @@ namespace {
     void setDoesNotAccessMemory(Function &F) {
       if (!F.doesNotAccessMemory()) {
-       F.setDoesNotAccessMemory();
-       ++NumAnnotated;
+        F.setDoesNotAccessMemory();
+        ++NumAnnotated;
       }
     }
 
     void setOnlyReadsMemory(Function &F) {
       if (!F.onlyReadsMemory()) {
-       F.setOnlyReadsMemory();
-       ++NumAnnotated;
+        F.setOnlyReadsMemory();
+        ++NumAnnotated;
       }
     }
 
     void setDoesNotThrow(Function &F) {
       if (!F.doesNotThrow()) {
-       F.setDoesNotThrow();
-       ++NumAnnotated;
+        F.setDoesNotThrow();
+        ++NumAnnotated;
       }
     }
 
     void setDoesNotCapture(Function &F, unsigned n) {
       if (!F.doesNotCapture(n)) {
-       F.setDoesNotCapture(n);
-       ++NumAnnotated;
+        F.setDoesNotCapture(n);
+        ++NumAnnotated;
+      }
+    }
+
+    void setOnlyReadsMemory(Function &F, unsigned n) {
+      if (!F.onlyReadsMemory(n)) {
+        F.setOnlyReadsMemory(n);
+        ++NumAnnotated;
       }
     }
 
     void setDoesNotAlias(Function &F, unsigned n) {
       if (!F.doesNotAlias(n)) {
-       F.setDoesNotAlias(n);
-       ++NumAnnotated;
+        F.setDoesNotAlias(n);
+        ++NumAnnotated;
       }
     }
@@ -129,7 +136,8 @@ namespace {
 char FunctionAttrs::ID = 0;
 INITIALIZE_PASS_BEGIN(FunctionAttrs, "functionattrs",
                 "Deduce function attributes", false, false)
-INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(CallGraph)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
 INITIALIZE_PASS_END(FunctionAttrs, "functionattrs",
                 "Deduce function attributes", false, false)
@@ -343,6 +351,7 @@ namespace {
       Function *F = CS.getCalledFunction();
       if (!F || !SCCNodes.count(F)) { Captured = true; return true; }
 
+      bool Found = false;
       Function::arg_iterator AI = F->arg_begin(), AE = F->arg_end();
       for (CallSite::arg_iterator PI = CS.arg_begin(), PE = CS.arg_end();
            PI != PE; ++PI, ++AI) {
@@ -353,10 +362,12 @@ namespace {
         }
         if (PI == U) {
           Uses.push_back(AI);
+          Found = true;
           break;
         }
       }
-      assert(!Uses.empty() && "Capturing call-site captured nothing?");
+      assert(Found && "Capturing call-site captured nothing?");
+      (void)Found;
       return false;
     }
@@ -394,8 +405,100 @@ namespace llvm {
   };
 }
 
-/// AddNoCaptureAttrs - Deduce nocapture attributes for the SCC.
-bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
+// Returns Attribute::None, Attribute::ReadOnly or Attribute::ReadNone.
+static Attribute::AttrKind
+determinePointerReadAttrs(Argument *A,
+                          const SmallPtrSet<Argument*, 8> &SCCNodes) {
+
+  SmallVector<Use*, 32> Worklist;
+  SmallSet<Use*, 32> Visited;
+  int Count = 0;
+
+  bool IsRead = false;
+  // We don't need to track IsWritten. If A is written to, return immediately.
+
+  for (Value::use_iterator UI = A->use_begin(), UE = A->use_end();
+       UI != UE; ++UI) {
+    if (Count++ >= 20)
+      return Attribute::None;
+
+    Use *U = &UI.getUse();
+    Visited.insert(U);
+    Worklist.push_back(U);
+  }
+
+  while (!Worklist.empty()) {
+    Use *U = Worklist.pop_back_val();
+    Instruction *I = cast<Instruction>(U->getUser());
+    Value *V = U->get();
+
+    switch (I->getOpcode()) {
+    case Instruction::BitCast:
+    case Instruction::GetElementPtr:
+    case Instruction::PHI:
+    case Instruction::Select:
+      // The original value is not read/written via this if the new value isn't.
+      for (Instruction::use_iterator UI = I->use_begin(), UE = I->use_end();
+           UI != UE; ++UI) {
+        Use *U = &UI.getUse();
+        if (Visited.insert(U))
+          Worklist.push_back(U);
+      }
+      break;
+
+    case Instruction::Call:
+    case Instruction::Invoke: {
+      CallSite CS(I);
+      if (CS.doesNotAccessMemory())
+        continue;
+
+      Function *F = CS.getCalledFunction();
+      if (!F) {
+        if (CS.onlyReadsMemory()) {
+          IsRead = true;
+          continue;
+        }
+        return Attribute::None;
+      }
+
+      Function::arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+      CallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end();
+      for (CallSite::arg_iterator A = B; A != E; ++A, ++AI) {
+        if (A->get() == V) {
+          if (AI == AE) {
+            assert(F->isVarArg() &&
+                   "More params than args in non-varargs call.");
+            return Attribute::None;
+          }
+          if (SCCNodes.count(AI))
+            continue;
+          if (!CS.onlyReadsMemory() && !CS.onlyReadsMemory(A - B))
+            return Attribute::None;
+          if (!CS.doesNotAccessMemory(A - B))
+            IsRead = true;
+        }
+      }
+      break;
+    }
+
+    case Instruction::Load:
+      IsRead = true;
+      break;
+
+    case Instruction::ICmp:
+    case Instruction::Ret:
+      break;
+
+    default:
+      return Attribute::None;
+    }
+  }
+
+  return IsRead ? Attribute::ReadOnly : Attribute::ReadNone;
+}
+
+/// AddArgumentAttrs - Deduce nocapture attributes for the SCC.
+bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) {
   bool Changed = false;
 
   SmallPtrSet<Function*, 8> SCCNodes;
@@ -442,8 +545,11 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
       continue;
     }
 
-    for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A!=E; ++A)
-      if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) {
+    for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end();
+         A != E; ++A) {
+      if (!A->getType()->isPointerTy()) continue;
+      bool HasNonLocalUses = false;
+      if (!A->hasNoCaptureAttr()) {
         ArgumentUsesTracker Tracker(SCCNodes);
         PointerMayBeCaptured(A, &Tracker);
         if (!Tracker.Captured) {
@@ -458,12 +564,32 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
             // its particulars for Argument-SCC analysis later.
             ArgumentGraphNode *Node = AG[A];
             for (SmallVectorImpl<Argument*>::iterator UI = Tracker.Uses.begin(),
-                   UE = Tracker.Uses.end(); UI != UE; ++UI)
+                   UE = Tracker.Uses.end(); UI != UE; ++UI) {
               Node->Uses.push_back(AG[*UI]);
+              if (*UI != A)
+                HasNonLocalUses = true;
+            }
           }
         }
         // Otherwise, it's captured. Don't bother doing SCC analysis on it.
       }
+      if (!HasNonLocalUses && !A->onlyReadsMemory()) {
+        // Can we determine that it's readonly/readnone without doing an SCC?
+        // Note that we don't allow any calls at all here, or else our result
+        // will be dependent on the iteration order through the functions in the
+        // SCC.
+        SmallPtrSet<Argument*, 8> Self;
+        Self.insert(A);
+        Attribute::AttrKind R = determinePointerReadAttrs(A, Self);
+        if (R != Attribute::None) {
+          AttrBuilder B;
+          B.addAttribute(R);
+          A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B));
+          Changed = true;
+          R == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg;
+        }
+      }
+    }
   }
 
   // The graph we've collected is partial because we stopped scanning for
@@ -482,11 +608,8 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
       // eg. "void f(int* x) { if (...) f(x); }"
       if (ArgumentSCC[0]->Uses.size() == 1 &&
           ArgumentSCC[0]->Uses[0] == ArgumentSCC[0]) {
-        ArgumentSCC[0]->
-          Definition->
-          addAttr(AttributeSet::get(ArgumentSCC[0]->Definition->getContext(),
-                                    ArgumentSCC[0]->Definition->getArgNo() + 1,
-                                    B));
+        Argument *A = ArgumentSCC[0]->Definition;
+        A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B));
         ++NumNoCapture;
         Changed = true;
       }
@@ -532,6 +655,42 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
       ++NumNoCapture;
       Changed = true;
     }
+
+    // We also want to compute readonly/readnone. With a small number of false
+    // negatives, we can assume that any pointer which is captured isn't going
+    // to be provably readonly or readnone, since by definition we can't
+    // analyze all uses of a captured pointer.
+    //
+    // The false negatives happen when the pointer is captured by a function
+    // that promises readonly/readnone behaviour on the pointer, then the
+    // pointer's lifetime ends before anything that writes to arbitrary memory.
+    // Also, a readonly/readnone pointer may be returned, but returning a
+    // pointer is capturing it.
+
+    Attribute::AttrKind ReadAttr = Attribute::ReadNone;
+    for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
+      Argument *A = ArgumentSCC[i]->Definition;
+      Attribute::AttrKind K = determinePointerReadAttrs(A, ArgumentSCCNodes);
+      if (K == Attribute::ReadNone)
+        continue;
+      if (K == Attribute::ReadOnly) {
+        ReadAttr = Attribute::ReadOnly;
+        continue;
+      }
+      ReadAttr = K;
+      break;
+    }
+
+    if (ReadAttr != Attribute::None) {
+      AttrBuilder B;
+      B.addAttribute(ReadAttr);
+      for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
+        Argument *A = ArgumentSCC[i]->Definition;
+        A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B));
+        ReadAttr == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg;
+        Changed = true;
+      }
+    }
   }
 
   return Changed;
@@ -678,24 +837,32 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setOnlyReadsMemory(F);
     setDoesNotThrow(F);
     break;
-  case LibFunc::strcpy:
-  case LibFunc::stpcpy:
-  case LibFunc::strcat:
   case LibFunc::strtol:
   case LibFunc::strtod:
   case LibFunc::strtof:
   case LibFunc::strtoul:
   case LibFunc::strtoll:
   case LibFunc::strtold:
+  case LibFunc::strtoull:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
+    break;
+  case LibFunc::strcpy:
+  case LibFunc::stpcpy:
+  case LibFunc::strcat:
   case LibFunc::strncat:
   case LibFunc::strncpy:
   case LibFunc::stpncpy:
-  case LibFunc::strtoull:
     if (FTy->getNumParams() < 2 ||
        !FTy->getParamType(1)->isPointerTy())
      return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::strxfrm:
     if (FTy->getNumParams() != 3 ||
@@ -705,14 +872,15 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
-    break;
-  case LibFunc::strcmp:
-  case LibFunc::strspn:
-  case LibFunc::strncmp:
-  case LibFunc::strcspn:
-  case LibFunc::strcoll:
-  case LibFunc::strcasecmp:
-  case LibFunc::strncasecmp:
+    setOnlyReadsMemory(F, 2);
+    break;
+  case LibFunc::strcmp: //0,1
+  case LibFunc::strspn: // 0,1
+  case LibFunc::strncmp: // 0,1
+  case LibFunc::strcspn: //0,1
+  case LibFunc::strcoll: //0,1
+  case LibFunc::strcasecmp: // 0,1
+  case LibFunc::strncasecmp: //
     if (FTy->getNumParams() < 2 ||
         !FTy->getParamType(0)->isPointerTy() ||
         !FTy->getParamType(1)->isPointerTy())
@@ -736,8 +904,15 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::scanf:
+    if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
+    break;
   case LibFunc::setbuf:
   case LibFunc::setvbuf:
     if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy())
@@ -753,11 +928,31 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotAlias(F, 0);
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::stat:
+  case LibFunc::statvfs:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
+    break;
   case LibFunc::sscanf:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
+    setOnlyReadsMemory(F, 2);
+    break;
   case LibFunc::sprintf:
-  case LibFunc::statvfs:
     if (FTy->getNumParams() < 2 ||
         !FTy->getParamType(0)->isPointerTy() ||
         !FTy->getParamType(1)->isPointerTy())
@@ -765,6 +960,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::snprintf:
     if (FTy->getNumParams() != 3 ||
@@ -774,6 +970,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 3);
+    setOnlyReadsMemory(F, 3);
     break;
   case LibFunc::setitimer:
     if (FTy->getNumParams() != 3 ||
@@ -783,6 +980,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 2);
     setDoesNotCapture(F, 3);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::system:
     if (FTy->getNumParams() != 1 ||
@@ -790,6 +988,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     // May throw; "system" is a valid pthread cancellation point.
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::malloc:
     if (FTy->getNumParams() != 1 ||
@@ -818,6 +1017,12 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
   case LibFunc::modf:
   case LibFunc::modff:
   case LibFunc::modfl:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    break;
   case LibFunc::memcpy:
   case LibFunc::memccpy:
   case LibFunc::memmove:
@@ -826,6 +1031,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::memalign:
     if (!FTy->getReturnType()->isPointerTy())
@@ -833,6 +1039,13 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotAlias(F, 0);
     break;
   case LibFunc::mkdir:
+    if (FTy->getNumParams() == 0 ||
+        !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
+    break;
   case LibFunc::mktime:
     if (FTy->getNumParams() == 0 ||
         !FTy->getParamType(0)->isPointerTy())
@@ -856,8 +1069,14 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     // May throw; "read" is a valid pthread cancellation point.
     setDoesNotCapture(F, 2);
     break;
-  case LibFunc::rmdir:
   case LibFunc::rewind:
+    if (FTy->getNumParams() < 1 ||
+        !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::rmdir:
   case LibFunc::remove:
   case LibFunc::realpath:
     if (FTy->getNumParams() < 1 ||
@@ -865,8 +1084,19 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::rename:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
+    setOnlyReadsMemory(F, 2);
+    break;
   case LibFunc::readlink:
     if (FTy->getNumParams() < 2 ||
         !FTy->getParamType(0)->isPointerTy() ||
@@ -875,12 +1105,14 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::write:
     if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy())
       return false;
     // May throw; "write" is a valid pthread cancellation point.
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::bcopy:
     if (FTy->getNumParams() != 3 ||
@@ -890,6 +1122,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::bcmp:
     if (FTy->getNumParams() != 3 ||
@@ -916,6 +1149,12 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     break;
   case LibFunc::chmod:
   case LibFunc::chown:
+    if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
+    break;
   case LibFunc::ctermid:
   case LibFunc::clearerr:
   case LibFunc::closedir:
@@ -939,6 +1178,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::fopen:
     if (FTy->getNumParams() != 2 ||
@@ -950,6 +1190,8 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotAlias(F, 0);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::fdopen:
     if (FTy->getNumParams() != 2 ||
@@ -959,6 +1201,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotAlias(F, 0);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::feof:
   case LibFunc::free:
@@ -1004,7 +1247,16 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 3);
+    break;
   case LibFunc::fread:
+    if (FTy->getNumParams() != 4 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(3)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 4);
+    break;
   case LibFunc::fwrite:
     if (FTy->getNumParams() != 4 ||
         !FTy->getParamType(0)->isPointerTy() ||
@@ -1013,9 +1265,28 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 4);
+    break;
   case LibFunc::fputs:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
+    break;
   case LibFunc::fscanf:
   case LibFunc::fprintf:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 2);
+    break;
   case LibFunc::fgetpos:
     if (FTy->getNumParams() < 2 ||
         !FTy->getParamType(0)->isPointerTy() ||
@@ -1055,6 +1326,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::ungetc:
     if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
@@ -1063,12 +1335,24 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotCapture(F, 2);
     break;
   case LibFunc::uname:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
   case LibFunc::unlink:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
+    break;
   case LibFunc::unsetenv:
     if (FTy->getNumParams() != 1 ||
         !FTy->getParamType(0)->isPointerTy())
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::utime:
   case LibFunc::utimes:
@@ -1079,6 +1363,8 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::putc:
     if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
@@ -1093,13 +1379,20 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::pread:
+    if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    // May throw; "pread" is a valid pthread cancellation point.
+    setDoesNotCapture(F, 2);
+    break;
   case LibFunc::pwrite:
     if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy())
       return false;
-    // May throw; these are valid pthread cancellation points.
+    // May throw; "pwrite" is a valid pthread cancellation point.
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::putchar:
     setDoesNotThrow(F);
@@ -1114,6 +1407,8 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotAlias(F, 0);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::pclose:
     if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
@@ -1126,8 +1421,19 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
      return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::vsscanf:
+    if (FTy->getNumParams() != 3 ||
+        !FTy->getParamType(1)->isPointerTy() ||
+        !FTy->getParamType(2)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
+    setOnlyReadsMemory(F, 2);
+    break;
   case LibFunc::vfscanf:
     if (FTy->getNumParams() != 3 ||
         !FTy->getParamType(1)->isPointerTy() ||
@@ -1136,6 +1442,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::valloc:
     if (!FTy->getReturnType()->isPointerTy())
@@ -1148,6 +1455,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::vfprintf:
   case LibFunc::vsprintf:
@@ -1158,6 +1466,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::vsnprintf:
     if (FTy->getNumParams() != 4 ||
@@ -1167,12 +1476,14 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 3);
+    setOnlyReadsMemory(F, 3);
     break;
   case LibFunc::open:
     if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy())
       return false;
     // May throw; "open" is a valid pthread cancellation point.
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::opendir:
     if (FTy->getNumParams() != 1 ||
@@ -1182,6 +1493,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     setDoesNotThrow(F);
     setDoesNotAlias(F, 0);
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::tmpfile:
     if (!FTy->getReturnType()->isPointerTy())
@@ -1210,12 +1522,14 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::lchown:
     if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy())
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::qsort:
     if (FTy->getNumParams() != 4 || !FTy->getParamType(3)->isPointerTy())
@@ -1232,6 +1546,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotAlias(F, 0);
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::dunder_strtok_r:
     if (FTy->getNumParams() != 3 ||
         !FTy->getParamType(1)->isPointerTy())
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::under_IO_getc:
     if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
@@ -1258,10 +1574,20 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::stat64:
   case LibFunc::lstat64:
   case LibFunc::statvfs64:
+    if (FTy->getNumParams() < 1 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
+    break;
   case LibFunc::dunder_isoc99_sscanf:
     if (FTy->getNumParams() < 1 ||
         !FTy->getParamType(0)->isPointerTy() ||
@@ -1270,6 +1596,8 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::fopen64:
     if (FTy->getNumParams() != 2 ||
@@ -1281,6 +1609,8 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotAlias(F, 0);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::fseeko64:
   case LibFunc::ftello64:
@@ -1307,7 +1637,18 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     // May throw; "open" is a valid pthread cancellation point.
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
+  case LibFunc::gettimeofday:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    // Currently some platforms have the restrict keyword on the arguments to
+    // gettimeofday. To be conservative, do not add noalias to gettimeofday's
+    // arguments.
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
   default:
     // Didn't mark any attributes.
     return false;
@@ -1339,7 +1680,7 @@ bool FunctionAttrs::runOnSCC(CallGraphSCC &SCC) {
   bool Changed = annotateLibraryCalls(SCC);
   Changed |= AddReadAttrs(SCC);
-  Changed |= AddNoCaptureAttrs(SCC);
+  Changed |= AddArgumentAttrs(SCC);
   Changed |= AddNoAliasAttrs(SCC);
   return Changed;
 }
diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp
index 201f320..901295d 100644
--- a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp
@@ -179,6 +179,9 @@ void GlobalDCE::GlobalIsNeeded(GlobalValue *G) {
     // any globals used will be marked as needed.
     Function *F = cast<Function>(G);
 
+    if (F->hasPrefixData())
+      MarkUsedGlobalsAsNeeded(F->getPrefixData());
+
     for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
       for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
         for (User::op_iterator U = I->op_begin(), E = I->op_end(); U != E; ++U)
diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 0ef900e..2ea89a1 100644
--- a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -37,7 +37,10 @@
 #include "llvm/Support/GetElementPtrTypeIterator.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ValueHandle.h"
 #include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
 #include <algorithm>
 using namespace llvm;
@@ -59,7 +62,6 @@
 STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated");
 STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed");
 
 namespace {
-  struct GlobalStatus;
   struct GlobalOpt : public ModulePass {
     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
       AU.addRequired<TargetLibraryInfo>();
@@ -79,7 +81,6 @@ namespace {
     bool OptimizeGlobalCtorsList(GlobalVariable *&GCL);
     bool ProcessGlobal(GlobalVariable *GV,Module::global_iterator &GVI);
     bool ProcessInternalGlobal(GlobalVariable *GV,Module::global_iterator &GVI,
-                               const SmallPtrSet<const PHINode*, 16> &PHIUsers,
                                const GlobalStatus &GS);
     bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn);
@@ -97,209 +98,6 @@ INITIALIZE_PASS_END(GlobalOpt, "globalopt",
 
 ModulePass *llvm::createGlobalOptimizerPass() { return new GlobalOpt(); }
 
-namespace {
-
-/// GlobalStatus - As we analyze each global, keep track of some information
-/// about it.  If we find out that the address of the global is taken, none of
-/// this info will be accurate.
-struct GlobalStatus {
-  /// isCompared - True if the global's address is used in a comparison.
-  bool isCompared;
-
-  /// isLoaded - True if the global is ever loaded.  If the global isn't ever
-  /// loaded it can be deleted.
-  bool isLoaded;
-
-  /// StoredType - Keep track of what stores to the global look like.
-  ///
-  enum StoredType {
-    /// NotStored - There is no store to this global.  It can thus be marked
-    /// constant.
-    NotStored,
-
-    /// isInitializerStored - This global is stored to, but the only thing
-    /// stored is the constant it was initialized with.  This is only tracked
-    /// for scalar globals.
-    isInitializerStored,
-
-    /// isStoredOnce - This global is stored to, but only its initializer and
-    /// one other value is ever stored to it.  If this global isStoredOnce, we
-    /// track the value stored to it in StoredOnceValue below.  This is only
-    /// tracked for scalar globals.
-    isStoredOnce,
-
-    /// isStored - This global is stored to by multiple values or something else
-    /// that we cannot track.
-    isStored
-  } StoredType;
-
-  /// StoredOnceValue - If only one value (besides the initializer constant) is
-  /// ever stored to this global, keep track of what value it is.
-  Value *StoredOnceValue;
-
-  /// AccessingFunction/HasMultipleAccessingFunctions - These start out
-  /// null/false.  When the first accessing function is noticed, it is recorded.
-  /// When a second different accessing function is noticed,
-  /// HasMultipleAccessingFunctions is set to true.
-  const Function *AccessingFunction;
-  bool HasMultipleAccessingFunctions;
-
-  /// HasNonInstructionUser - Set to true if this global has a user that is not
-  /// an instruction (e.g. a constant expr or GV initializer).
-  bool HasNonInstructionUser;
-
-  /// AtomicOrdering - Set to the strongest atomic ordering requirement.
-  AtomicOrdering Ordering;
-
-  GlobalStatus() : isCompared(false), isLoaded(false), StoredType(NotStored),
-                   StoredOnceValue(0), AccessingFunction(0),
-                   HasMultipleAccessingFunctions(false),
-                   HasNonInstructionUser(false), Ordering(NotAtomic) {}
-};
-
-}
-
-/// StrongerOrdering - Return the stronger of the two ordering. If the two
-/// orderings are acquire and release, then return AcquireRelease.
-///
-static AtomicOrdering StrongerOrdering(AtomicOrdering X, AtomicOrdering Y) {
-  if (X == Acquire && Y == Release) return AcquireRelease;
-  if (Y == Acquire && X == Release) return AcquireRelease;
-  return (AtomicOrdering)std::max(X, Y);
-}
-
-/// SafeToDestroyConstant - It is safe to destroy a constant iff it is only used
-/// by constants itself.  Note that constants cannot be cyclic, so this test is
-/// pretty easy to implement recursively.
-///
-static bool SafeToDestroyConstant(const Constant *C) {
-  if (isa<GlobalValue>(C)) return false;
-
-  for (Value::const_use_iterator UI = C->use_begin(), E = C->use_end(); UI != E;
-       ++UI)
-    if (const Constant *CU = dyn_cast<Constant>(*UI)) {
-      if (!SafeToDestroyConstant(CU)) return false;
-    } else
-      return false;
-  return true;
-}
-
-
-/// AnalyzeGlobal - Look at all uses of the global and fill in the GlobalStatus
-/// structure.  If the global has its address taken, return true to indicate we
-/// can't do anything with it.
-///
-static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS,
-                          SmallPtrSet<const PHINode*, 16> &PHIUsers) {
-  for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;
-       ++UI) {
-    const User *U = *UI;
-    if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
-      GS.HasNonInstructionUser = true;
-
-      // If the result of the constantexpr isn't pointer type, then we won't
-      // know to expect it in various places.  Just reject early.
-      if (!isa<PointerType>(CE->getType())) return true;
-
-      if (AnalyzeGlobal(CE, GS, PHIUsers)) return true;
-    } else if (const Instruction *I = dyn_cast<Instruction>(U)) {
-      if (!GS.HasMultipleAccessingFunctions) {
-        const Function *F = I->getParent()->getParent();
-        if (GS.AccessingFunction == 0)
-          GS.AccessingFunction = F;
-        else if (GS.AccessingFunction != F)
-          GS.HasMultipleAccessingFunctions = true;
-      }
-      if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
-        GS.isLoaded = true;
-        // Don't hack on volatile loads.
-        if (LI->isVolatile()) return true;
-        GS.Ordering = StrongerOrdering(GS.Ordering, LI->getOrdering());
-      } else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
-        // Don't allow a store OF the address, only stores TO the address.
-        if (SI->getOperand(0) == V) return true;
-
-        // Don't hack on volatile stores.
-        if (SI->isVolatile()) return true;
-
-        GS.Ordering = StrongerOrdering(GS.Ordering, SI->getOrdering());
-
-        // If this is a direct store to the global (i.e., the global is a scalar
-        // value, not an aggregate), keep more specific information about
-        // stores.
-        if (GS.StoredType != GlobalStatus::isStored) {
-          if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(
-                                                           SI->getOperand(1))) {
-            Value *StoredVal = SI->getOperand(0);
-
-            if (Constant *C = dyn_cast<Constant>(StoredVal)) {
-              if (C->isThreadDependent()) {
-                // The stored value changes between threads; don't track it.
-                return true;
-              }
-            }
-
-            if (StoredVal == GV->getInitializer()) {
-              if (GS.StoredType < GlobalStatus::isInitializerStored)
-                GS.StoredType = GlobalStatus::isInitializerStored;
-            } else if (isa<LoadInst>(StoredVal) &&
-                       cast<LoadInst>(StoredVal)->getOperand(0) == GV) {
-              if (GS.StoredType < GlobalStatus::isInitializerStored)
-                GS.StoredType = GlobalStatus::isInitializerStored;
-            } else if (GS.StoredType < GlobalStatus::isStoredOnce) {
-              GS.StoredType = GlobalStatus::isStoredOnce;
-              GS.StoredOnceValue = StoredVal;
-            } else if (GS.StoredType == GlobalStatus::isStoredOnce &&
-                       GS.StoredOnceValue == StoredVal) {
-              // noop.
-            } else {
-              GS.StoredType = GlobalStatus::isStored;
-            }
-          } else {
-            GS.StoredType = GlobalStatus::isStored;
-          }
-        }
-      } else if (isa<BitCastInst>(I)) {
-        if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
-      } else if (isa<GetElementPtrInst>(I)) {
-        if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
-      } else if (isa<SelectInst>(I)) {
-        if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
-      } else if (const PHINode *PN = dyn_cast<PHINode>(I)) {
-        // PHI nodes we can check just like select or GEP instructions, but we
-        // have to be careful about infinite recursion.
-        if (PHIUsers.insert(PN))  // Not already visited.
-          if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
-      } else if (isa<CmpInst>(I)) {
-        GS.isCompared = true;
-      } else if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) {
-        if (MTI->isVolatile()) return true;
-        if (MTI->getArgOperand(0) == V)
-          GS.StoredType = GlobalStatus::isStored;
-        if (MTI->getArgOperand(1) == V)
-          GS.isLoaded = true;
-      } else if (const MemSetInst *MSI = dyn_cast<MemSetInst>(I)) {
-        assert(MSI->getArgOperand(0) == V && "Memset only takes one pointer!");
-        if (MSI->isVolatile()) return true;
-        GS.StoredType = GlobalStatus::isStored;
-      } else {
-        return true;  // Any other non-load instruction might take address!
-      }
-    } else if (const Constant *C = dyn_cast<Constant>(U)) {
-      GS.HasNonInstructionUser = true;
-      // We might have a dead and dangling constant hanging off of here.
-      if (!SafeToDestroyConstant(C))
-        return true;
-    } else {
-      GS.HasNonInstructionUser = true;
-      // Otherwise must be some other user.
-      return true;
-    }
-  }
-
-  return false;
-}
-
 /// isLeakCheckerRoot - Is this global variable possibly used by a leak checker
 /// as a root?  If so, we might not really want to eliminate the stores to it.
 static bool isLeakCheckerRoot(GlobalVariable *GV) {
@@ -433,7 +231,7 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV,
           Changed = true;
         }
       } else if (Constant *C = dyn_cast<Constant>(U)) {
-        if (SafeToDestroyConstant(C)) {
+        if (isSafeToDestroyConstant(C)) {
           C->destroyConstant();
           // This could have invalidated UI, start over from scratch.
           Dead.clear();
@@ -470,9 +268,17 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV,
 static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
                                        DataLayout *TD, TargetLibraryInfo *TLI) {
   bool Changed = false;
-  SmallVector<User*, 8> WorkList(V->use_begin(), V->use_end());
+  // Note that we need to use a weak value handle for the worklist items. When
+  // we delete a constant array, we may also be holding pointer to one of its
+  // elements (or an element of one of its elements if we're dealing with an
+  // array of arrays) in the worklist.
+  SmallVector<WeakVH, 8> WorkList(V->use_begin(), V->use_end());
   while (!WorkList.empty()) {
-    User *U = WorkList.pop_back_val();
+    Value *UV = WorkList.pop_back_val();
+    if (!UV)
+      continue;
+
+    User *U = cast<User>(UV);
 
     if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
       if (Init) {
@@ -533,7 +339,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
     } else if (Constant *C = dyn_cast<Constant>(U)) {
       // If we have a chain of dead constantexprs or other things dangling from
       // us, and if they are all dead, nuke them without remorse.
-      if (SafeToDestroyConstant(C)) {
+      if (isSafeToDestroyConstant(C)) {
         C->destroyConstant();
         CleanupConstantGlobalUsers(V, Init, TD, TLI);
         return true;
@@ -548,7 +354,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
 static bool isSafeSROAElementUse(Value *V) {
   // We might have a dead and dangling constant hanging off of here.
   if (Constant *C = dyn_cast<Constant>(V))
-    return SafeToDestroyConstant(C);
+    return isSafeToDestroyConstant(C);
 
   Instruction *I = dyn_cast<Instruction>(V);
   if (!I) return false;
@@ -1372,8 +1178,7 @@ static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
   } else if (PHINode *PN = dyn_cast<PHINode>(V)) {
     // PN's type is pointer to struct.  Make a new PHI of pointer to struct
     // field.
-    StructType *ST =
-      cast<StructType>(cast<PointerType>(PN->getType())->getElementType());
+    StructType *ST = cast<StructType>(PN->getType()->getPointerElementType());
 
     PHINode *NewPN =
      PHINode::Create(PointerType::getUnqual(ST->getElementType(FieldNo)),
@@ -1504,7 +1309,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
       unsigned TypeSize = TD->getTypeAllocSize(FieldTy);
       if (StructType *ST = dyn_cast<StructType>(FieldTy))
         TypeSize = TD->getStructLayout(ST)->getSizeInBytes();
-      Type *IntPtrTy = TD->getIntPtrType(CI->getContext());
+      Type *IntPtrTy = TD->getIntPtrType(CI->getType());
       Value *NMI = CallInst::CreateMalloc(CI, IntPtrTy, FieldTy,
                                           ConstantInt::get(IntPtrTy, TypeSize),
                                           NElems, 0,
@@ -1734,7 +1539,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
 
   // If this is a fixed size array, transform the Malloc to be an alloc of
   // structs.  malloc [100 x struct],1 -> malloc struct, 100
malloc [100 x struct],1 -> malloc struct, 100 if (ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI, TLI))) { - Type *IntPtrTy = TD->getIntPtrType(CI->getContext()); + Type *IntPtrTy = TD->getIntPtrType(CI->getType()); unsigned TypeSize = TD->getStructLayout(AllocSTy)->getSizeInBytes(); Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize); Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements()); @@ -1916,13 +1721,12 @@ bool GlobalOpt::ProcessGlobal(GlobalVariable *GV, if (!GV->hasLocalLinkage()) return false; - SmallPtrSet<const PHINode*, 16> PHIUsers; GlobalStatus GS; - if (AnalyzeGlobal(GV, GS, PHIUsers)) + if (GlobalStatus::analyzeGlobal(GV, GS)) return false; - if (!GS.isCompared && !GV->hasUnnamedAddr()) { + if (!GS.IsCompared && !GV->hasUnnamedAddr()) { GV->setUnnamedAddr(true); NumUnnamed++; } @@ -1930,19 +1734,17 @@ bool GlobalOpt::ProcessGlobal(GlobalVariable *GV, if (GV->isConstant() || !GV->hasInitializer()) return false; - return ProcessInternalGlobal(GV, GVI, PHIUsers, GS); + return ProcessInternalGlobal(GV, GVI, GS); } /// ProcessInternalGlobal - Analyze the specified global variable and optimize /// it if possible. If we make a change, return true. bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, Module::global_iterator &GVI, - const SmallPtrSet<const PHINode*, 16> &PHIUsers, const GlobalStatus &GS) { // If this is a first class global and has only one accessing function - // and this function is main (which we know is not recursive we can make - // this global a local variable) we replace the global with a local alloca - // in this function. + // and this function is main (which we know is not recursive), we replace + // the global with a local alloca in this function. // // NOTE: It doesn't make sense to promote non single-value types since we // are just replacing static memory to stack memory. @@ -1971,7 +1773,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, // If the global is never loaded (but may be stored to), it is dead. // Delete it now. - if (!GS.isLoaded) { + if (!GS.IsLoaded) { DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV); bool Changed; @@ -1992,7 +1794,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, } return Changed; - } else if (GS.StoredType <= GlobalStatus::isInitializerStored) { + } else if (GS.StoredType <= GlobalStatus::InitializerStored) { DEBUG(dbgs() << "MARKING CONSTANT: " << *GV << "\n"); GV->setConstant(true); @@ -2015,7 +1817,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, GVI = FirstNewGV; // Don't skip the newly produced globals! return true; } - } else if (GS.StoredType == GlobalStatus::isStoredOnce) { + } else if (GS.StoredType == GlobalStatus::StoredOnce) { // If the initial value for the global was an undef value, and if only // one other value was stored into it, we can just change the // initializer to be the stored value, then delete all stores to the @@ -2048,11 +1850,14 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, // Otherwise, if the global was not a boolean, we can shrink it to be a // boolean. 
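The code that follows applies this shrink-to-boolean fold. Its source-level effect is easy to picture; the C++ analogue below is only an illustration (the pass rewrites LLVM IR, and every name here is made up): a global that starts at its zero initializer and is only ever assigned one other constant can be tracked with a single boolean, with loads rebuilt as a select.

// Before: the global carries a full int, yet at run time it only ever holds
// its zero initializer or the single stored constant 42.
static int G;
void set() { G = 42; }
int get() { return G; }

// After, in effect: one boolean records whether the store happened, and each
// load rematerializes the right constant with a select.
static bool GWasSet;
void setShrunk() { GWasSet = true; }
int getShrunk() { return GWasSet ? 42 : 0; }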
- if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue)) - if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) { - ++NumShrunkToBool; - return true; + if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue)) { + if (GS.Ordering == NotAtomic) { + if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) { + ++NumShrunkToBool; + return true; + } } + } } return false; @@ -2210,8 +2015,7 @@ static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL, CSVals[1] = 0; StructType *StructTy = - cast <StructType>( - cast<ArrayType>(GCL->getType()->getElementType())->getElementType()); + cast<StructType>(GCL->getType()->getElementType()->getArrayElementType()); // Create the new init list. std::vector<Constant*> CAList; @@ -2784,7 +2588,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, Value *Ptr = PtrArg->stripPointerCasts(); if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) { Type *ElemTy = cast<PointerType>(GV->getType())->getElementType(); - if (!Size->isAllOnesValue() && + if (TD && !Size->isAllOnesValue() && Size->getValue().getLimitedValue() >= TD->getTypeStoreSize(ElemTy)) { Invariants.insert(GV); @@ -3041,107 +2845,148 @@ bool GlobalOpt::OptimizeGlobalCtorsList(GlobalVariable *&GCL) { return true; } -static Value::use_iterator getFirst(Value *V, SmallPtrSet<Use*, 8> &Tried) { - for (Value::use_iterator I = V->use_begin(), E = V->use_end(); I != E; ++I) { - Use *U = &I.getUse(); - if (Tried.count(U)) - continue; - - User *Usr = *I; - GlobalVariable *GV = dyn_cast<GlobalVariable>(Usr); - if (!GV || !GV->hasName()) { - Tried.insert(U); - return I; - } - - StringRef Name = GV->getName(); - if (Name != "llvm.used" && Name != "llvm.compiler_used") { - Tried.insert(U); - return I; - } - } - return V->use_end(); +static int compareNames(Constant *const *A, Constant *const *B) { + return (*A)->getName().compare((*B)->getName()); } -static bool replaceAllNonLLVMUsedUsesWith(Constant *Old, Constant *New); - -static bool replaceUsesOfWithOnConstant(ConstantArray *CA, Value *From, - Value *ToV, Use *U) { - Constant *To = cast<Constant>(ToV); - - SmallVector<Constant*, 8> NewOps; - for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i) { - Constant *Op = CA->getOperand(i); - NewOps.push_back(Op == From ? To : Op); +static void setUsedInitializer(GlobalVariable &V, + SmallPtrSet<GlobalValue *, 8> Init) { + if (Init.empty()) { + V.eraseFromParent(); + return; } - Constant *Replacement = ConstantArray::get(CA->getType(), NewOps); - assert(Replacement != CA && "CA didn't contain From!"); + SmallVector<llvm::Constant *, 8> UsedArray; + PointerType *Int8PtrTy = Type::getInt8PtrTy(V.getContext()); - bool Ret = replaceAllNonLLVMUsedUsesWith(CA, Replacement); - if (Replacement->use_empty()) - Replacement->destroyConstant(); - if (CA->use_empty()) - CA->destroyConstant(); - return Ret; + for (SmallPtrSet<GlobalValue *, 8>::iterator I = Init.begin(), E = Init.end(); + I != E; ++I) { + Constant *Cast = llvm::ConstantExpr::getBitCast(*I, Int8PtrTy); + UsedArray.push_back(Cast); + } + // Sort to get deterministic order. 
+ array_pod_sort(UsedArray.begin(), UsedArray.end(), compareNames); + ArrayType *ATy = ArrayType::get(Int8PtrTy, UsedArray.size()); + + Module *M = V.getParent(); + V.removeFromParent(); + GlobalVariable *NV = + new GlobalVariable(*M, ATy, false, llvm::GlobalValue::AppendingLinkage, + llvm::ConstantArray::get(ATy, UsedArray), ""); + NV->takeName(&V); + NV->setSection("llvm.metadata"); + delete &V; } -static bool replaceUsesOfWithOnConstant(ConstantExpr *CE, Value *From, - Value *ToV, Use *U) { - Constant *To = cast<Constant>(ToV); - SmallVector<Constant*, 8> NewOps; - for (unsigned i = 0, e = CE->getNumOperands(); i != e; ++i) { - Constant *Op = CE->getOperand(i); - NewOps.push_back(Op == From ? To : Op); +namespace { +/// \brief An easy to access representation of llvm.used and llvm.compiler.used. +class LLVMUsed { + SmallPtrSet<GlobalValue *, 8> Used; + SmallPtrSet<GlobalValue *, 8> CompilerUsed; + GlobalVariable *UsedV; + GlobalVariable *CompilerUsedV; + +public: + LLVMUsed(Module &M) { + UsedV = collectUsedGlobalVariables(M, Used, false); + CompilerUsedV = collectUsedGlobalVariables(M, CompilerUsed, true); + } + typedef SmallPtrSet<GlobalValue *, 8>::iterator iterator; + iterator usedBegin() { return Used.begin(); } + iterator usedEnd() { return Used.end(); } + iterator compilerUsedBegin() { return CompilerUsed.begin(); } + iterator compilerUsedEnd() { return CompilerUsed.end(); } + bool usedCount(GlobalValue *GV) const { return Used.count(GV); } + bool compilerUsedCount(GlobalValue *GV) const { + return CompilerUsed.count(GV); + } + bool usedErase(GlobalValue *GV) { return Used.erase(GV); } + bool compilerUsedErase(GlobalValue *GV) { return CompilerUsed.erase(GV); } + bool usedInsert(GlobalValue *GV) { return Used.insert(GV); } + bool compilerUsedInsert(GlobalValue *GV) { return CompilerUsed.insert(GV); } + + void syncVariablesAndSets() { + if (UsedV) + setUsedInitializer(*UsedV, Used); + if (CompilerUsedV) + setUsedInitializer(*CompilerUsedV, CompilerUsed); } +}; +} - Constant *Replacement = CE->getWithOperands(NewOps); - assert(Replacement != CE && "CE didn't contain From!"); +static bool hasUseOtherThanLLVMUsed(GlobalAlias &GA, const LLVMUsed &U) { + if (GA.use_empty()) // No use at all. + return false; - bool Ret = replaceAllNonLLVMUsedUsesWith(CE, Replacement); - if (Replacement->use_empty()) - Replacement->destroyConstant(); - if (CE->use_empty()) - CE->destroyConstant(); - return Ret; + assert((!U.usedCount(&GA) || !U.compilerUsedCount(&GA)) && + "We should have removed the duplicated " + "element from llvm.compiler.used"); + if (!GA.hasOneUse()) + // Strictly more than one use. So at least one is not in llvm.used and + // llvm.compiler.used. + return true; + + // Exactly one use. Check if it is in llvm.used or llvm.compiler.used. 
+ return !U.usedCount(&GA) && !U.compilerUsedCount(&GA); } -static bool replaceUsesOfWithOnConstant(Constant *C, Value *From, Value *To, - Use *U) { - if (ConstantArray *CA = dyn_cast<ConstantArray>(C)) - return replaceUsesOfWithOnConstant(CA, From, To, U); - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) - return replaceUsesOfWithOnConstant(CE, From, To, U); - C->replaceUsesOfWithOnConstant(From, To, U); - return true; +static bool hasMoreThanOneUseOtherThanLLVMUsed(GlobalValue &V, + const LLVMUsed &U) { + unsigned N = 2; + assert((!U.usedCount(&V) || !U.compilerUsedCount(&V)) && + "We should have removed the duplicated " + "element from llvm.compiler.used"); + if (U.usedCount(&V) || U.compilerUsedCount(&V)) + ++N; + return V.hasNUsesOrMore(N); } -static bool replaceAllNonLLVMUsedUsesWith(Constant *Old, Constant *New) { - SmallPtrSet<Use*, 8> Tried; - bool Ret = false; - for (;;) { - Value::use_iterator I = getFirst(Old, Tried); - if (I == Old->use_end()) - break; - Use &U = I.getUse(); +static bool mayHaveOtherReferences(GlobalAlias &GA, const LLVMUsed &U) { + if (!GA.hasLocalLinkage()) + return true; - // Must handle Constants specially, we cannot call replaceUsesOfWith on a - // constant because they are uniqued. - if (Constant *C = dyn_cast<Constant>(U.getUser())) { - if (!isa<GlobalValue>(C)) { - Ret |= replaceUsesOfWithOnConstant(C, Old, New, &U); - continue; - } - } + return U.usedCount(&GA) || U.compilerUsedCount(&GA); +} - U.set(New); +static bool hasUsesToReplace(GlobalAlias &GA, LLVMUsed &U, bool &RenameTarget) { + RenameTarget = false; + bool Ret = false; + if (hasUseOtherThanLLVMUsed(GA, U)) Ret = true; - } - return Ret; + + // If the alias is externally visible, we may still be able to simplify it. + if (!mayHaveOtherReferences(GA, U)) + return Ret; + + // If the aliasee has internal linkage, give it the name and linkage + // of the alias, and delete the alias. This turns: + // define internal ... @f(...) + // @a = alias ... @f + // into: + // define ... @a(...) + Constant *Aliasee = GA.getAliasee(); + GlobalValue *Target = cast<GlobalValue>(Aliasee->stripPointerCasts()); + if (!Target->hasLocalLinkage()) + return Ret; + + // Do not perform the transform if multiple aliases potentially target the + // aliasee. This check also ensures that it is safe to replace the section + // and other attributes of the aliasee with those of the alias. + if (hasMoreThanOneUseOtherThanLLVMUsed(*Target, U)) + return Ret; + + RenameTarget = true; + return true; } bool GlobalOpt::OptimizeGlobalAliases(Module &M) { bool Changed = false; + LLVMUsed Used(M); + + for (SmallPtrSet<GlobalValue *, 8>::iterator I = Used.usedBegin(), + E = Used.usedEnd(); + I != E; ++I) + Used.compilerUsedErase(*I); for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); I != E;) { @@ -3156,38 +3001,29 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) { Constant *Aliasee = J->getAliasee(); GlobalValue *Target = cast<GlobalValue>(Aliasee->stripPointerCasts()); Target->removeDeadConstantUsers(); - bool hasOneUse = Target->hasOneUse() && Aliasee->hasOneUse(); // Make all users of the alias use the aliasee instead. - if (replaceAllNonLLVMUsedUsesWith(J, Aliasee)) { - ++NumAliasesResolved; - Changed = true; - } - if (!J->use_empty()) + bool RenameTarget; + if (!hasUsesToReplace(*J, Used, RenameTarget)) continue; - // If the alias is externally visible, we may still be able to simplify it. 
- if (!J->hasLocalLinkage()) { - // If the aliasee has internal linkage, give it the name and linkage - // of the alias, and delete the alias. This turns: - // define internal ... @f(...) - // @a = alias ... @f - // into: - // define ... @a(...) - if (!Target->hasLocalLinkage()) - continue; - - // Do not perform the transform if multiple aliases potentially target the - // aliasee. This check also ensures that it is safe to replace the section - // and other attributes of the aliasee with those of the alias. - if (!hasOneUse) - continue; + J->replaceAllUsesWith(Aliasee); + ++NumAliasesResolved; + Changed = true; + if (RenameTarget) { // Give the aliasee the name, linkage and other attributes of the alias. Target->takeName(J); Target->setLinkage(J->getLinkage()); Target->GlobalValue::copyAttributesFrom(J); - } + + if (Used.usedErase(J)) + Used.usedInsert(Target); + + if (Used.compilerUsedErase(J)) + Used.compilerUsedInsert(Target); + } else if (mayHaveOtherReferences(*J, Used)) + continue; // Delete the alias. M.getAliasList().erase(J); @@ -3195,6 +3031,8 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) { Changed = true; } + Used.syncVariablesAndSets(); + return Changed; } @@ -3323,8 +3161,6 @@ bool GlobalOpt::runOnModule(Module &M) { // Try to find the llvm.globalctors list. GlobalVariable *GlobalCtors = FindGlobalCtors(M); - Function *CXAAtExitFn = FindCXAAtExit(M, TLI); - bool LocalChange = true; while (LocalChange) { LocalChange = false; @@ -3342,7 +3178,9 @@ bool GlobalOpt::runOnModule(Module &M) { // Resolve aliases, when possible. LocalChange |= OptimizeGlobalAliases(M); - // Try to remove trivial global destructors. + // Try to remove trivial global destructors if they are not removed + // already. + Function *CXAAtExitFn = FindCXAAtExit(M, TLI); if (CXAAtExitFn) LocalChange |= OptimizeEmptyGlobalCXXDtors(CXAAtExitFn); diff --git a/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp b/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp index a0095da..437597e 100644 --- a/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp +++ b/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp @@ -63,7 +63,7 @@ public: char AlwaysInliner::ID = 0; INITIALIZE_PASS_BEGIN(AlwaysInliner, "always-inline", "Inliner for always_inline functions", false, false) -INITIALIZE_AG_DEPENDENCY(CallGraph) +INITIALIZE_PASS_DEPENDENCY(CallGraph) INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis) INITIALIZE_PASS_END(AlwaysInliner, "always-inline", "Inliner for always_inline functions", false, false) diff --git a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp index a4f7026..57379a3 100644 --- a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp +++ b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp @@ -28,7 +28,7 @@ using namespace llvm; namespace { -/// \brief Actaul inliner pass implementation. +/// \brief Actual inliner pass implementation. /// /// The common implementation of the inlining logic is shared between this /// inliner pass and the always inliner pass. 
The two passes use different cost @@ -61,7 +61,7 @@ public: char SimpleInliner::ID = 0; INITIALIZE_PASS_BEGIN(SimpleInliner, "inline", "Function Integration/Inlining", false, false) -INITIALIZE_AG_DEPENDENCY(CallGraph) +INITIALIZE_PASS_DEPENDENCY(CallGraph) INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis) INITIALIZE_PASS_END(SimpleInliner, "inline", "Function Integration/Inlining", false, false) diff --git a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp index 663ddb7..d75d6ca 100644 --- a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp @@ -116,7 +116,8 @@ static void AdjustCallerSSPLevel(Function *Caller, Function *Callee) { /// any new allocas to the set if not possible. static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, InlinedArrayAllocasTy &InlinedArrayAllocas, - int InlineHistory, bool InsertLifetime) { + int InlineHistory, bool InsertLifetime, + const DataLayout *TD) { Function *Callee = CS.getCalledFunction(); Function *Caller = CS.getCaller(); @@ -189,6 +190,14 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, bool MergedAwayAlloca = false; for (unsigned i = 0, e = AllocasForType.size(); i != e; ++i) { AllocaInst *AvailableAlloca = AllocasForType[i]; + + unsigned Align1 = AI->getAlignment(), + Align2 = AvailableAlloca->getAlignment(); + // If we don't have data layout information, and only one alloca is using + // the target default, then we can't safely merge them because we can't + // pick the greater alignment. + if (!TD && (!Align1 || !Align2) && Align1 != Align2) + continue; // The available alloca has to be in the right function, not in some other // function in this SCC. @@ -206,6 +215,20 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, << *AvailableAlloca << '\n'); AI->replaceAllUsesWith(AvailableAlloca); + + if (Align1 != Align2) { + if (!Align1 || !Align2) { + assert(TD && "DataLayout required to compare default alignments"); + unsigned TypeAlign = TD->getABITypeAlignment(AI->getAllocatedType()); + + Align1 = Align1 ? Align1 : TypeAlign; + Align2 = Align2 ? Align2 : TypeAlign; + } + + if (Align1 > Align2) + AvailableAlloca->setAlignment(AI->getAlignment()); + } + AI->eraseFromParent(); MergedAwayAlloca = true; ++NumMergedAllocas; @@ -482,7 +505,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { // Attempt to inline the function. if (!InlineCallIfPossible(CS, InlineInfo, InlinedArrayAllocas, - InlineHistoryID, InsertLifetime)) + InlineHistoryID, InsertLifetime, TD)) continue; ++NumInlined; diff --git a/contrib/llvm/lib/Transforms/IPO/Internalize.cpp b/contrib/llvm/lib/Transforms/IPO/Internalize.cpp index 4bfab5b..64e2ced 100644 --- a/contrib/llvm/lib/Transforms/IPO/Internalize.cpp +++ b/contrib/llvm/lib/Transforms/IPO/Internalize.cpp @@ -11,10 +11,17 @@ // If the function or variable is not in the list of external names given to // the pass it is marked as internal. // +// This transformation would not be legal in a regular compilation, but it gets +// extra information from the linker about what is safe. +// +// For example: internalizing a function with external linkage is only safe +// if we are told that it is only used from within this module.
+// //===----------------------------------------------------------------------===// #define DEBUG_TYPE "internalize" #include "llvm/Transforms/IPO.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/IR/Module.h" @@ -22,6 +29,8 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/GlobalStatus.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" #include <fstream> #include <set> using namespace llvm; @@ -48,10 +57,8 @@ namespace { public: static char ID; // Pass identification, replacement for typeid explicit InternalizePass(); - explicit InternalizePass(ArrayRef<const char *> exportList); + explicit InternalizePass(ArrayRef<const char *> ExportList); void LoadFile(const char *Filename); - void ClearExportList(); - void AddToExportList(const std::string &val); virtual bool runOnModule(Module &M); virtual void getAnalysisUsage(AnalysisUsage &AU) const { @@ -70,15 +77,14 @@ InternalizePass::InternalizePass() initializeInternalizePassPass(*PassRegistry::getPassRegistry()); if (!APIFile.empty()) // If a filename is specified, use it. LoadFile(APIFile.c_str()); - if (!APIList.empty()) // If a list is specified, use it as well. - ExternalNames.insert(APIList.begin(), APIList.end()); + ExternalNames.insert(APIList.begin(), APIList.end()); } -InternalizePass::InternalizePass(ArrayRef<const char *> exportList) +InternalizePass::InternalizePass(ArrayRef<const char *> ExportList) : ModulePass(ID){ initializeInternalizePassPass(*PassRegistry::getPassRegistry()); - for(ArrayRef<const char *>::const_iterator itr = exportList.begin(); - itr != exportList.end(); itr++) { + for(ArrayRef<const char *>::const_iterator itr = ExportList.begin(); + itr != ExportList.end(); itr++) { ExternalNames.insert(*itr); } } @@ -99,12 +105,25 @@ void InternalizePass::LoadFile(const char *Filename) { } } -void InternalizePass::ClearExportList() { - ExternalNames.clear(); -} +static bool shouldInternalize(const GlobalValue &GV, + const std::set<std::string> &ExternalNames) { + // Function must be defined here + if (GV.isDeclaration()) + return false; + + // Available externally is really just a "declaration with a body". + if (GV.hasAvailableExternallyLinkage()) + return false; + + // Already has internal linkage + if (GV.hasLocalLinkage()) + return false; + + // Marked to keep external? + if (ExternalNames.count(GV.getName())) + return false; -void InternalizePass::AddToExportList(const std::string &val) { - ExternalNames.insert(val); + return true; } bool InternalizePass::runOnModule(Module &M) { @@ -112,26 +131,40 @@ bool InternalizePass::runOnModule(Module &M) { CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : 0; bool Changed = false; - // Never internalize functions which code-gen might insert. - // FIXME: We should probably add this (and the __stack_chk_guard) via some - // type of call-back in CodeGen. - ExternalNames.insert("__stack_chk_fail"); + SmallPtrSet<GlobalValue *, 8> Used; + collectUsedGlobalVariables(M, Used, false); + + // We must assume that globals in llvm.used have a reference that not even + // the linker can see, so we don't internalize them. + // For llvm.compiler.used the situation is a bit fuzzy. The assembler and + // linker can drop those symbols. If this pass is running as part of LTO, + // one might think that it could just drop llvm.compiler.used. The problem + // is that even in LTO llvm doesn't see every reference. 
For example, + // we don't see references from function local inline assembly. To be + // conservative, we internalize symbols in llvm.compiler.used, but we + // keep llvm.compiler.used so that the symbol is not deleted by llvm. + for (SmallPtrSet<GlobalValue *, 8>::iterator I = Used.begin(), E = Used.end(); + I != E; ++I) { + GlobalValue *V = *I; + ExternalNames.insert(V->getName()); + } // Mark all functions not in the api as internal. // FIXME: maybe use private linkage? - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) - if (!I->isDeclaration() && // Function must be defined here - // Available externally is really just a "declaration with a body". - !I->hasAvailableExternallyLinkage() && - !I->hasLocalLinkage() && // Can't already have internal linkage - !ExternalNames.count(I->getName())) {// Not marked to keep external? - I->setLinkage(GlobalValue::InternalLinkage); + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { + if (!shouldInternalize(*I, ExternalNames)) + continue; + + I->setLinkage(GlobalValue::InternalLinkage); + + if (ExternalNode) // Remove a callgraph edge from the external node to this function. - if (ExternalNode) ExternalNode->removeOneAbstractEdgeTo((*CG)[I]); - Changed = true; - ++NumFunctions; - DEBUG(dbgs() << "Internalizing func " << I->getName() << "\n"); - } + ExternalNode->removeOneAbstractEdgeTo((*CG)[I]); + + Changed = true; + ++NumFunctions; + DEBUG(dbgs() << "Internalizing func " << I->getName() << "\n"); + } // Never internalize the llvm.used symbol. It is used to implement // attribute((used)). @@ -146,35 +179,36 @@ bool InternalizePass::runOnModule(Module &M) { ExternalNames.insert("llvm.global.annotations"); // Never internalize symbols code-gen inserts. + // FIXME: We should probably add this (and the __stack_chk_guard) via some + // type of call-back in CodeGen. + ExternalNames.insert("__stack_chk_fail"); ExternalNames.insert("__stack_chk_guard"); // Mark all global variables with initializers that are not in the api as // internal as well. // FIXME: maybe use private linkage? for (Module::global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) - if (!I->isDeclaration() && !I->hasLocalLinkage() && - // Available externally is really just a "declaration with a body". - !I->hasAvailableExternallyLinkage() && - !ExternalNames.count(I->getName())) { - I->setLinkage(GlobalValue::InternalLinkage); - Changed = true; - ++NumGlobals; - DEBUG(dbgs() << "Internalized gvar " << I->getName() << "\n"); - } + I != E; ++I) { + if (!shouldInternalize(*I, ExternalNames)) + continue; + + I->setLinkage(GlobalValue::InternalLinkage); + Changed = true; + ++NumGlobals; + DEBUG(dbgs() << "Internalized gvar " << I->getName() << "\n"); + } // Mark all aliases that are not in the api as internal as well. for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); - I != E; ++I) - if (!I->isDeclaration() && !I->hasInternalLinkage() && - // Available externally is really just a "declaration with a body". 
- !I->hasAvailableExternallyLinkage() && - !ExternalNames.count(I->getName())) { - I->setLinkage(GlobalValue::InternalLinkage); - Changed = true; - ++NumAliases; - DEBUG(dbgs() << "Internalized alias " << I->getName() << "\n"); - } + I != E; ++I) { + if (!shouldInternalize(*I, ExternalNames)) + continue; + + I->setLinkage(GlobalValue::InternalLinkage); + Changed = true; + ++NumAliases; + DEBUG(dbgs() << "Internalized alias " << I->getName() << "\n"); + } return Changed; } @@ -183,6 +217,6 @@ ModulePass *llvm::createInternalizePass() { return new InternalizePass(); } -ModulePass *llvm::createInternalizePass(ArrayRef<const char *> el) { - return new InternalizePass(el); +ModulePass *llvm::createInternalizePass(ArrayRef<const char *> ExportList) { + return new InternalizePass(ExportList); } diff --git a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp index 4ce749c..3861421 100644 --- a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp +++ b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp @@ -210,16 +210,20 @@ private: // Any two pointers in the same address space are equivalent, intptr_t and // pointers are equivalent. Otherwise, standard type equivalence rules apply. bool FunctionComparator::isEquivalentType(Type *Ty1, Type *Ty2) const { + + PointerType *PTy1 = dyn_cast<PointerType>(Ty1); + PointerType *PTy2 = dyn_cast<PointerType>(Ty2); + + if (TD) { + if (PTy1 && PTy1->getAddressSpace() == 0) Ty1 = TD->getIntPtrType(Ty1); + if (PTy2 && PTy2->getAddressSpace() == 0) Ty2 = TD->getIntPtrType(Ty2); + } + if (Ty1 == Ty2) return true; - if (Ty1->getTypeID() != Ty2->getTypeID()) { - if (TD) { - LLVMContext &Ctx = Ty1->getContext(); - if (isa<PointerType>(Ty1) && Ty2 == TD->getIntPtrType(Ctx)) return true; - if (isa<PointerType>(Ty2) && Ty1 == TD->getIntPtrType(Ctx)) return true; - } + + if (Ty1->getTypeID() != Ty2->getTypeID()) return false; - } switch (Ty1->getTypeID()) { default: @@ -241,8 +245,7 @@ bool FunctionComparator::isEquivalentType(Type *Ty1, Type *Ty2) const { return true; case Type::PointerTyID: { - PointerType *PTy1 = cast<PointerType>(Ty1); - PointerType *PTy2 = cast<PointerType>(Ty2); + assert(PTy1 && PTy2 && "Both types must be pointers here."); return PTy1->getAddressSpace() == PTy2->getAddressSpace(); } @@ -352,14 +355,19 @@ bool FunctionComparator::isEquivalentOperation(const Instruction *I1, // Determine whether two GEP operations perform the same underlying arithmetic. bool FunctionComparator::isEquivalentGEP(const GEPOperator *GEP1, const GEPOperator *GEP2) { - // When we have target data, we can reduce the GEP down to the value in bytes - // added to the address. - unsigned BitWidth = TD ? TD->getPointerSizeInBits() : 1; - APInt Offset1(BitWidth, 0), Offset2(BitWidth, 0); - if (TD && - GEP1->accumulateConstantOffset(*TD, Offset1) && - GEP2->accumulateConstantOffset(*TD, Offset2)) { - return Offset1 == Offset2; + unsigned AS = GEP1->getPointerAddressSpace(); + if (AS != GEP2->getPointerAddressSpace()) + return false; + + if (TD) { + // When we have target data, we can reduce the GEP down to the value in bytes + // added to the address. + unsigned BitWidth = TD ? 
TD->getPointerSizeInBits(AS) : 1; + APInt Offset1(BitWidth, 0), Offset2(BitWidth, 0); + if (GEP1->accumulateConstantOffset(*TD, Offset1) && + GEP2->accumulateConstantOffset(*TD, Offset2)) { + return Offset1 == Offset2; + } } if (GEP1->getPointerOperand()->getType() != @@ -713,6 +721,19 @@ void MergeFunctions::writeThunkOrAlias(Function *F, Function *G) { writeThunk(F, G); } +// Helper for writeThunk. +// Selects the proper cast operation, +// but is a bit simpler than CastInst::getCastOpcode. +static Value* createCast(IRBuilder<false> &Builder, Value *V, Type *DestTy) { + Type *SrcTy = V->getType(); + if (SrcTy->isIntegerTy() && DestTy->isPointerTy()) + return Builder.CreateIntToPtr(V, DestTy); + else if (SrcTy->isPointerTy() && DestTy->isIntegerTy()) + return Builder.CreatePtrToInt(V, DestTy); + else + return Builder.CreateBitCast(V, DestTy); +} + // Replace G with a simple tail call to bitcast(F). Also replace direct uses // of G with bitcast(F). Deletes G. void MergeFunctions::writeThunk(Function *F, Function *G) { @@ -738,7 +759,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) { FunctionType *FFTy = F->getFunctionType(); for (Function::arg_iterator AI = NewG->arg_begin(), AE = NewG->arg_end(); AI != AE; ++AI) { - Args.push_back(Builder.CreateBitCast(AI, FFTy->getParamType(i))); + Args.push_back(createCast(Builder, (Value*)AI, FFTy->getParamType(i))); ++i; } @@ -748,13 +769,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) { if (NewG->getReturnType()->isVoidTy()) { Builder.CreateRetVoid(); } else { - Type *RetTy = NewG->getReturnType(); - if (CI->getType()->isIntegerTy() && RetTy->isPointerTy()) - Builder.CreateRet(Builder.CreateIntToPtr(CI, RetTy)); - else if (CI->getType()->isPointerTy() && RetTy->isIntegerTy()) - Builder.CreateRet(Builder.CreatePtrToInt(CI, RetTy)); - else - Builder.CreateRet(Builder.CreateBitCast(CI, RetTy)); + Builder.CreateRet(createCast(Builder, CI, NewG->getReturnType())); } NewG->copyAttributesFrom(G); @@ -829,6 +844,18 @@ bool MergeFunctions::insert(ComparableFunction &NewF) { const ComparableFunction &OldF = *Result.first; + // Don't merge tiny functions, since merging them can just end up making the + // function larger. + // FIXME: Should still merge them if they are unnamed_addr and produce an + // alias. + if (NewF.getFunc()->size() == 1) { + if (NewF.getFunc()->front().size() <= 2) { + DEBUG(dbgs() << NewF.getFunc()->getName() + << " is too small to bother merging\n"); + return false; + } + } + // Never thunk a strong function to a weak function.
assert(!OldF.getFunc()->mayBeOverridden() || NewF.getFunc()->mayBeOverridden()); diff --git a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 986c0b8..24c5018 100644 --- a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -29,15 +29,20 @@ using namespace llvm; static cl::opt<bool> -RunLoopVectorization("vectorize-loops", +RunLoopVectorization("vectorize-loops", cl::Hidden, cl::desc("Run the Loop vectorization passes")); static cl::opt<bool> -RunSLPVectorization("vectorize-slp", +LateVectorization("late-vectorize", cl::init(true), cl::Hidden, + cl::desc("Run the vectorization passes late in the pass " + "pipeline (after the inliner)")); + +static cl::opt<bool> +RunSLPVectorization("vectorize-slp", cl::Hidden, cl::desc("Run the SLP vectorization passes")); static cl::opt<bool> -RunBBVectorization("vectorize-slp-aggressive", +RunBBVectorization("vectorize-slp-aggressive", cl::Hidden, cl::desc("Run the BB vectorization passes")); static cl::opt<bool> @@ -49,17 +54,22 @@ static cl::opt<bool> UseNewSROA("use-new-sroa", cl::init(true), cl::Hidden, cl::desc("Enable the new, experimental SROA pass")); +static cl::opt<bool> +RunLoopRerolling("reroll-loops", cl::Hidden, + cl::desc("Run the loop rerolling pass")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; LibraryInfo = 0; Inliner = 0; - DisableSimplifyLibCalls = false; DisableUnitAtATime = false; DisableUnrollLoops = false; BBVectorize = RunBBVectorization; SLPVectorize = RunSLPVectorization; LoopVectorize = RunLoopVectorization; + LateVectorize = LateVectorization; + RerollLoops = RunLoopRerolling; } PassManagerBuilder::~PassManagerBuilder() { @@ -174,8 +184,6 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { else MPM.add(createScalarReplAggregatesPass(-1, false)); MPM.add(createEarlyCSEPass()); // Catch trivial redundancies - if (!DisableSimplifyLibCalls) - MPM.add(createSimplifyLibCallsPass()); // Library Call Optimizations MPM.add(createJumpThreadingPass()); // Thread jumps. MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals MPM.add(createCFGSimplificationPass()); // Merge & remove BBs @@ -192,8 +200,8 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. MPM.add(createLoopDeletionPass()); // Delete dead loops - if (LoopVectorize && OptLevel > 2) - MPM.add(createLoopVectorizePass()); + if (!LateVectorize && LoopVectorize) + MPM.add(createLoopVectorizePass(DisableUnrollLoops)); if (!DisableUnrollLoops) MPM.add(createLoopUnrollPass()); // Unroll small loops @@ -213,16 +221,18 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { addExtensionsToPM(EP_ScalarOptimizerLate, MPM); + if (RerollLoops) + MPM.add(createLoopRerollPass()); if (SLPVectorize) - MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. + MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
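These cl::opt flags feed the PassManagerBuilder members of the same names, which clients consult when populating a pipeline. A minimal sketch of such a client, assuming the LLVM 3.4-era C++ API this file belongs to:

#include "llvm/IR/Module.h"
#include "llvm/PassManager.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
using namespace llvm;

// Build and run an -O2-style pipeline. With LateVectorize at its new default
// of true, populateModulePassManager defers loop vectorization to the
// separate post-inliner block rather than the main scalar pipeline.
static void optimizeModule(Module &M) {
  PassManagerBuilder Builder;
  Builder.OptLevel = 2;
  Builder.LoopVectorize = true;
  Builder.SLPVectorize = true;

  PassManager PM;
  Builder.populateModulePassManager(PM);
  PM.run(M);
}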
if (BBVectorize) { MPM.add(createBBVectorizePass()); MPM.add(createInstructionCombiningPass()); if (OptLevel > 1 && UseGVNAfterVectorization) - MPM.add(createGVNPass()); // Remove redundancies + MPM.add(createGVNPass()); // Remove redundancies else - MPM.add(createEarlyCSEPass()); // Catch trivial redundancies + MPM.add(createEarlyCSEPass()); // Catch trivial redundancies // BBVectorize may have significantly shortened a loop body; unroll again. if (!DisableUnrollLoops) @@ -230,9 +240,25 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { } MPM.add(createAggressiveDCEPass()); // Delete dead instructions - MPM.add(createCFGSimplificationPass()); // Merge & remove BBs + MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createInstructionCombiningPass()); // Clean up after everything. + // As an experimental mode, run any vectorization passes in a separate + // pipeline from the CGSCC pass manager that runs iteratively with the + // inliner. + if (LateVectorize && LoopVectorize) { + // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC + // pass manager that we are specifically trying to avoid. To prevent this + // we must insert a no-op module pass to reset the pass manager. + MPM.add(createBarrierNoopPass()); + + // Add the various vectorization passes and relevant cleanup passes for + // them since we are no longer in the middle of the main scalar pipeline. + MPM.add(createLoopVectorizePass(DisableUnrollLoops)); + MPM.add(createInstructionCombiningPass()); + MPM.add(createCFGSimplificationPass()); + } + if (!DisableUnitAtATime) { // FIXME: We shouldn't bother with this anymore. MPM.add(createStripDeadPrototypesPass()); // Get rid of dead prototypes @@ -257,11 +283,8 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, // Now that composite has been compiled, scan through the module, looking // for a main function. If main is defined, mark all other functions // internal. - if (Internalize) { - std::vector<const char*> E; - E.push_back("main"); - PM.add(createInternalizePass(E)); - } + if (Internalize) + PM.add(createInternalizePass("main")); // Propagate constants at call sites into the functions they call. This // opens opportunities for globalopt (and inlining) by substituting function @@ -302,6 +325,7 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, // The IPO passes may leave cruft around. Clean up after them. PM.add(createInstructionCombiningPass()); PM.add(createJumpThreadingPass()); + // Break up allocas if (UseNewSROA) PM.add(createSROAPass()); @@ -315,6 +339,7 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, PM.add(createLICMPass()); // Hoist loop invariants. PM.add(createGVNPass(DisableGVNLoadPRE)); // Remove redundancies. PM.add(createMemCpyOptPass()); // Remove dead memcpys. + // Nuke dead stores. PM.add(createDeadStoreEliminationPass()); @@ -379,8 +404,7 @@ LLVMPassManagerBuilderSetDisableUnrollLoops(LLVMPassManagerBuilderRef PMB, void LLVMPassManagerBuilderSetDisableSimplifyLibCalls(LLVMPassManagerBuilderRef PMB, LLVMBool Value) { - PassManagerBuilder *Builder = unwrap(PMB); - Builder->DisableSimplifyLibCalls = Value; + // NOTE: The simplify-libcalls pass has been removed. 
} void diff --git a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp index 73d9323..b160913 100644 --- a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp @@ -51,7 +51,7 @@ namespace { char PruneEH::ID = 0; INITIALIZE_PASS_BEGIN(PruneEH, "prune-eh", "Remove unused exception handling info", false, false) -INITIALIZE_AG_DEPENDENCY(CallGraph) +INITIALIZE_PASS_DEPENDENCY(CallGraph) INITIALIZE_PASS_END(PruneEH, "prune-eh", "Remove unused exception handling info", false, false) @@ -145,15 +145,13 @@ bool PruneEH::runOnSCC(CallGraphSCC &SCC) { NewAttributes.addAttribute(Attribute::NoReturn); Function *F = (*I)->getFunction(); - const AttributeSet &PAL = F->getAttributes(); - const AttributeSet &NPAL = - PAL.addAttributes(F->getContext(), AttributeSet::FunctionIndex, - AttributeSet::get(F->getContext(), - AttributeSet::FunctionIndex, - NewAttributes)); + const AttributeSet &PAL = F->getAttributes().getFnAttributes(); + const AttributeSet &NPAL = AttributeSet::get( + F->getContext(), AttributeSet::FunctionIndex, NewAttributes); + if (PAL != NPAL) { MadeChange = true; - F->setAttributes(NPAL); + F->addAttributes(AttributeSet::FunctionIndex, NPAL); } } diff --git a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp index 3396f79..c4f5cfc 100644 --- a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp +++ b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp @@ -9,7 +9,7 @@ // // The StripSymbols transformation implements code stripping. Specifically, it // can delete: -// +// // * names for virtual registers // * symbols for internal globals and functions // * debug information @@ -39,7 +39,7 @@ namespace { bool OnlyDebugInfo; public: static char ID; // Pass identification, replacement for typeid - explicit StripSymbols(bool ODI = false) + explicit StripSymbols(bool ODI = false) : ModulePass(ID), OnlyDebugInfo(ODI) { initializeStripSymbolsPass(*PassRegistry::getPassRegistry()); } @@ -144,7 +144,7 @@ static void RemoveDeadConstant(Constant *C) { assert(C->use_empty() && "Constant is not dead!"); SmallPtrSet<Constant*, 4> Operands; for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) - if (OnlyUsedBy(C->getOperand(i), C)) + if (OnlyUsedBy(C->getOperand(i), C)) Operands.insert(cast<Constant>(C->getOperand(i))); if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) { if (!GV->hasLocalLinkage()) return; // Don't delete non static globals. 
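RemoveDeadConstant's strategy, shown in part above, is to free the dead constant first and then recurse into any operand for which it was the only user. The shape of that recursion, sketched over a hypothetical reference-counted node graph rather than LLVM's Constant classes:

#include <vector>

struct Node {
  std::vector<Node *> Operands;
  unsigned NumUses;
};

// Delete a node that is known to be dead, then recursively delete any
// operand for which it was the last remaining user.
static void removeDead(Node *N) {
  for (std::vector<Node *>::iterator I = N->Operands.begin(),
                                     E = N->Operands.end(); I != E; ++I)
    if (--(*I)->NumUses == 0)
      removeDead(*I);
  delete N;
}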
@@ -182,7 +182,7 @@ static void StripTypeNames(Module &M, bool PreserveDbgInfo) { for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) { StructType *STy = StructTypes[i]; if (STy->isLiteral() || STy->getName().empty()) continue; - + if (PreserveDbgInfo && STy->getName().startswith("llvm.dbg")) continue; @@ -199,7 +199,7 @@ static void findUsedValues(GlobalVariable *LLVMUsed, ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer()); for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) - if (GlobalValue *GV = + if (GlobalValue *GV = dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts())) UsedValues.insert(GV); } @@ -217,71 +217,20 @@ static bool StripSymbolNames(Module &M, bool PreserveDbgInfo) { if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg")) I->setName(""); // Internal symbols can't participate in linkage } - + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { if (I->hasLocalLinkage() && llvmUsedValues.count(I) == 0) if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg")) I->setName(""); // Internal symbols can't participate in linkage StripSymtab(I->getValueSymbolTable(), PreserveDbgInfo); } - + // Remove all names from types. StripTypeNames(M, PreserveDbgInfo); return true; } -// StripDebugInfo - Strip debug info in the module if it exists. -// To do this, we remove llvm.dbg.func.start, llvm.dbg.stoppoint, and -// llvm.dbg.region.end calls, and any globals they point to if now dead. -static bool StripDebugInfo(Module &M) { - - bool Changed = false; - - // Remove all of the calls to the debugger intrinsics, and remove them from - // the module. - if (Function *Declare = M.getFunction("llvm.dbg.declare")) { - while (!Declare->use_empty()) { - CallInst *CI = cast<CallInst>(Declare->use_back()); - CI->eraseFromParent(); - } - Declare->eraseFromParent(); - Changed = true; - } - - if (Function *DbgVal = M.getFunction("llvm.dbg.value")) { - while (!DbgVal->use_empty()) { - CallInst *CI = cast<CallInst>(DbgVal->use_back()); - CI->eraseFromParent(); - } - DbgVal->eraseFromParent(); - Changed = true; - } - - for (Module::named_metadata_iterator NMI = M.named_metadata_begin(), - NME = M.named_metadata_end(); NMI != NME;) { - NamedMDNode *NMD = NMI; - ++NMI; - if (NMD->getName().startswith("llvm.dbg.")) { - NMD->eraseFromParent(); - Changed = true; - } - } - - for (Module::iterator MI = M.begin(), ME = M.end(); MI != ME; ++MI) - for (Function::iterator FI = MI->begin(), FE = MI->end(); FI != FE; - ++FI) - for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; - ++BI) { - if (!BI->getDebugLoc().isUnknown()) { - Changed = true; - BI->setDebugLoc(DebugLoc()); - } - } - - return Changed; -} - bool StripSymbols::runOnModule(Module &M) { bool Changed = false; Changed |= StripDebugInfo(M); @@ -307,13 +256,13 @@ bool StripDebugDeclare::runOnModule(Module &M) { assert(CI->use_empty() && "llvm.dbg intrinsic should have void result"); CI->eraseFromParent(); if (Arg1->use_empty()) { - if (Constant *C = dyn_cast<Constant>(Arg1)) + if (Constant *C = dyn_cast<Constant>(Arg1)) DeadConstants.push_back(C); - else + else RecursivelyDeleteTriviallyDeadInstructions(Arg1); } if (Arg2->use_empty()) - if (Constant *C = dyn_cast<Constant>(Arg2)) + if (Constant *C = dyn_cast<Constant>(Arg2)) DeadConstants.push_back(C); } Declare->eraseFromParent(); @@ -332,81 +281,107 @@ bool StripDebugDeclare::runOnModule(Module &M) { return true; } -/// getRealLinkageName - If special LLVM prefix that is used to inform the asm -/// printer to not emit 
usual symbol prefix before the symbol name is used then -/// return linkage name after skipping this special LLVM prefix. -static StringRef getRealLinkageName(StringRef LinkageName) { - char One = '\1'; - if (LinkageName.startswith(StringRef(&One, 1))) - return LinkageName.substr(1); - return LinkageName; -} - +/// Remove any debug info for global variables/functions in the given module +/// whose corresponding global variable or function no longer exists (i.e. is +/// null). +/// +/// Debugging information is encoded in llvm IR using metadata. This is +/// designed in such a way that debug info for symbols is preserved even if +/// the symbols are optimized away by the optimizer. This special pass removes +/// debug info for such symbols. bool StripDeadDebugInfo::runOnModule(Module &M) { bool Changed = false; - // Debugging infomration is encoded in llvm IR using metadata. This is designed - // such a way that debug info for symbols preserved even if symbols are - // optimized away by the optimizer. This special pass removes debug info for - // such symbols. - - // llvm.dbg.gv keeps track of debug info for global variables. - if (NamedMDNode *NMD = M.getNamedMetadata("llvm.dbg.gv")) { - SmallVector<MDNode *, 8> MDs; - for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) - if (DIGlobalVariable(NMD->getOperand(i)).Verify()) - MDs.push_back(NMD->getOperand(i)); - else - Changed = true; - NMD->eraseFromParent(); - NMD = NULL; - - for (SmallVector<MDNode *, 8>::iterator I = MDs.begin(), - E = MDs.end(); I != E; ++I) { - GlobalVariable *GV = DIGlobalVariable(*I).getGlobal(); - if (GV && M.getGlobalVariable(GV->getName(), true)) { - if (!NMD) - NMD = M.getOrInsertNamedMetadata("llvm.dbg.gv"); - NMD->addOperand(*I); - } + LLVMContext &C = M.getContext(); + + // Find all debug info in M. This is actually overkill in terms of what we + // want to do, but we want to try and be as resilient as possible in the face + // of potential debug info changes by using the formal interfaces given to us + // as much as possible. + DebugInfoFinder F; + F.processModule(M); + + // For each compile unit, find the live set of global variables/functions and + // replace the current list of potentially dead global variables/functions + // with the live list. + SmallVector<Value *, 64> LiveGlobalVariables; + SmallVector<Value *, 64> LiveSubprograms; + DenseSet<const MDNode *> VisitedSet; + + for (DebugInfoFinder::iterator CI = F.compile_unit_begin(), + CE = F.compile_unit_end(); CI != CE; ++CI) { + // Create our compile unit. + DICompileUnit DIC(*CI); + assert(DIC.Verify() && "DIC must verify as a DICompileUnit."); + + // Create our live subprogram list. + DIArray SPs = DIC.getSubprograms(); + bool SubprogramChange = false; + for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) { + DISubprogram DISP(SPs.getElement(i)); + assert(DISP.Verify() && "DISP must verify as a DISubprogram."); + + // Make sure we visit each subprogram only once. + if (!VisitedSet.insert(DISP).second) + continue; + + // If the function referenced by DISP is not null, the function is live. + if (DISP.getFunction()) + LiveSubprograms.push_back(DISP); else - Changed = true; + SubprogramChange = true; } - } - // llvm.dbg.sp keeps track of debug info for subprograms. - if (NamedMDNode *NMD = M.getNamedMetadata("llvm.dbg.sp")) { - SmallVector<MDNode *, 8> MDs; - for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) - if (DISubprogram(NMD->getOperand(i)).Verify()) - MDs.push_back(NMD->getOperand(i)); + // Create our live global variable list.
+ DIArray GVs = DIC.getGlobalVariables(); + bool GlobalVariableChange = false; + for (unsigned i = 0, e = GVs.getNumElements(); i != e; ++i) { + DIGlobalVariable DIG(GVs.getElement(i)); + assert(DIG.Verify() && "DIG must verify as DIGlobalVariable."); + + // Make sure we visit each global variable only once. + if (!VisitedSet.insert(DIG).second) + continue; + + // If the global variable referenced by DIG is not null, the global + // variable is live. + if (DIG.getGlobal()) + LiveGlobalVariables.push_back(DIG); else - Changed = true; - NMD->eraseFromParent(); - NMD = NULL; - - for (SmallVector<MDNode *, 8>::iterator I = MDs.begin(), - E = MDs.end(); I != E; ++I) { - bool FnIsLive = false; - if (Function *F = DISubprogram(*I).getFunction()) - if (M.getFunction(F->getName())) - FnIsLive = true; - if (FnIsLive) { - if (!NMD) - NMD = M.getOrInsertNamedMetadata("llvm.dbg.sp"); - NMD->addOperand(*I); - } else { - // Remove llvm.dbg.lv.fnname named mdnode which may have been used - // to hold debug info for dead function's local variables. - StringRef FName = DISubprogram(*I).getLinkageName(); - if (FName.empty()) - FName = DISubprogram(*I).getName(); - if (NamedMDNode *LVNMD = - M.getNamedMetadata(Twine("llvm.dbg.lv.", - getRealLinkageName(FName)))) - LVNMD->eraseFromParent(); - } + GlobalVariableChange = true; + } + + // If we found dead subprograms or global variables, replace the current + // subprogram list/global variable list with our new live subprogram/global + // variable list. + if (SubprogramChange) { + // Make sure that 9 is still the index of the subprograms. This is to make + // sure that an assert is hit if the location of the subprogram array + // changes. This is just to make sure that this is updated if such an + // event occurs. + assert(DIC->getNumOperands() >= 10 && + SPs == DIC->getOperand(9) && + "DICompileUnits is expected to store Subprograms in operand " + "9."); + DIC->replaceOperandWith(9, MDNode::get(C, LiveSubprograms)); + Changed = true; } + + if (GlobalVariableChange) { + // Make sure that 10 is still the index of global variables. This is to + // make sure that an assert is hit if the location of the global variable + // array changes. This is just to make sure that this index is updated if + // such an event occurs. + assert(DIC->getNumOperands() >= 11 && + GVs == DIC->getOperand(10) && + "DICompileUnits is expected to store Global Variables in operand " + "10."); + DIC->replaceOperandWith(10, MDNode::get(C, LiveGlobalVariables)); + Changed = true; + } + + // Reset lists for the next iteration.
+ LiveSubprograms.clear(); + LiveGlobalVariables.clear(); } return Changed; diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombine.h b/contrib/llvm/lib/Transforms/InstCombine/InstCombine.h index 2a36074..a5eddc2 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombine.h +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombine.h @@ -1,4 +1,4 @@ -//===- InstCombine.h - Main InstCombine pass definition -------------------===// +//===- InstCombine.h - Main InstCombine pass definition ---------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -158,8 +158,8 @@ public: ConstantInt *DivRHS); Instruction *FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *DivI, ConstantInt *DivRHS); - Instruction *FoldICmpAddOpCst(ICmpInst &ICI, Value *X, ConstantInt *CI, - ICmpInst::Predicate Pred, Value *TheAdd); + Instruction *FoldICmpAddOpCst(Instruction &ICI, Value *X, ConstantInt *CI, + ICmpInst::Predicate Pred); Instruction *FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, ICmpInst::Predicate Cond, Instruction &I); Instruction *FoldShiftByConstant(Value *Op0, ConstantInt *Op1, @@ -178,6 +178,7 @@ public: Instruction *visitPtrToInt(PtrToIntInst &CI); Instruction *visitIntToPtr(IntToPtrInst &CI); Instruction *visitBitCast(BitCastInst &CI); + Instruction *visitAddrSpaceCast(AddrSpaceCastInst &CI); Instruction *FoldSelectOpOp(SelectInst &SI, Instruction *TI, Instruction *FI); Instruction *FoldSelectIntoOp(SelectInst &SI, Value*, Value*); @@ -212,8 +213,8 @@ private: bool ShouldChangeType(Type *From, Type *To) const; Value *dyn_castNegVal(Value *V) const; Value *dyn_castFNegVal(Value *V, bool NoSignedZero=false) const; - Type *FindElementAtOffset(Type *Ty, int64_t Offset, - SmallVectorImpl<Value*> &NewIndices); + Type *FindElementAtOffset(Type *PtrTy, int64_t Offset, + SmallVectorImpl<Value*> &NewIndices); Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI); /// ShouldOptimizeCast - Return true if the cast from "V to Ty" actually @@ -234,6 +235,7 @@ private: bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS); Value *EmitGEPOffset(User *GEP); Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN); + Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask); public: // InsertNewInstBefore - insert an instruction New before instruction Old @@ -270,7 +272,7 @@ public: if (&I == V) V = UndefValue::get(I.getType()); - DEBUG(errs() << "IC: Replacing " << I << "\n" + DEBUG(dbgs() << "IC: Replacing " << I << "\n" " with " << *V << '\n'); I.replaceAllUsesWith(V); @@ -282,7 +284,7 @@ public: // instruction. Instead, visit methods should return the value returned by // this function. 
Instruction *EraseInstFromFunction(Instruction &I) { - DEBUG(errs() << "IC: ERASE " << I << '\n'); + DEBUG(dbgs() << "IC: ERASE " << I << '\n'); assert(I.use_empty() && "Cannot erase instruction that is used!"); // Make sure that we reprocess all operands now that we reduced their diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 166f8df..534feb8 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "InstCombine.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/DataLayout.h" #include "llvm/Support/GetElementPtrTypeIterator.h" @@ -488,7 +489,7 @@ Value *FAddCombine::performFactorization(Instruction *I) { createFSub(AddSub0, AddSub1); if (ConstantFP *CFP = dyn_cast<ConstantFP>(NewAddSub)) { const APFloat &F = CFP->getValueAPF(); - if (!F.isNormal() || F.isDenormal()) + if (!F.isNormal()) return 0; } @@ -659,7 +660,7 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { } } - assert((NextTmpIdx <= sizeof(TmpResult)/sizeof(TmpResult[0]) + 1) && + assert((NextTmpIdx <= array_lengthof(TmpResult) + 1) && "out-of-bound access"); if (ConstAdd) @@ -876,7 +877,7 @@ static inline Value *dyn_castFoldableMul(Value *V, ConstantInt *&CST) { uint32_t BitWidth = cast<IntegerType>(V->getType())->getBitWidth(); uint32_t CSTVal = CST->getLimitedValue(BitWidth); CST = ConstantInt::get(V->getType()->getContext(), - APInt(BitWidth, 1).shl(CSTVal)); + APInt::getOneBitSet(BitWidth, CSTVal)); return I->getOperand(0); } return 0; @@ -1185,9 +1186,15 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { if (Value *V = SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(), TD)) return ReplaceInstUsesWith(I, V); - if (isa<Constant>(RHS) && isa<PHINode>(LHS)) - if (Instruction *NV = FoldOpIntoPhi(I)) - return NV; + if (isa<Constant>(RHS)) { + if (isa<PHINode>(LHS)) + if (Instruction *NV = FoldOpIntoPhi(I)) + return NV; + + if (SelectInst *SI = dyn_cast<SelectInst>(LHS)) + if (Instruction *NV = FoldOpIntoSelect(I, SI)) + return NV; + } // -A + B --> B - A // -A + -B --> -(A + B) @@ -1516,9 +1523,33 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) { if (Value *V = SimplifyFSubInst(Op0, Op1, I.getFastMathFlags(), TD)) return ReplaceInstUsesWith(I, V); - // If this is a 'B = x-(-A)', change to B = x+A... - if (Value *V = dyn_castFNegVal(Op1)) - return BinaryOperator::CreateFAdd(Op0, V); + if (isa<Constant>(Op0)) + if (SelectInst *SI = dyn_cast<SelectInst>(Op1)) + if (Instruction *NV = FoldOpIntoSelect(I, SI)) + return NV; + + // If this is a 'B = x-(-A)', change to B = x+A, potentially looking + // through FP extensions/truncations along the way. 
+ if (Value *V = dyn_castFNegVal(Op1)) { + Instruction *NewI = BinaryOperator::CreateFAdd(Op0, V); + NewI->copyFastMathFlags(&I); + return NewI; + } + if (FPTruncInst *FPTI = dyn_cast<FPTruncInst>(Op1)) { + if (Value *V = dyn_castFNegVal(FPTI->getOperand(0))) { + Value *NewTrunc = Builder->CreateFPTrunc(V, I.getType()); + Instruction *NewI = BinaryOperator::CreateFAdd(Op0, NewTrunc); + NewI->copyFastMathFlags(&I); + return NewI; + } + } else if (FPExtInst *FPEI = dyn_cast<FPExtInst>(Op1)) { + if (Value *V = dyn_castFNegVal(FPEI->getOperand(0))) { + Value *NewExt = Builder->CreateFPExt(V, I.getType()); + Instruction *NewI = BinaryOperator::CreateFAdd(Op0, NewExt); + NewI->copyFastMathFlags(&I); + return NewI; + } + } if (I.hasUnsafeAlgebra()) { if (Value *V = FAddCombine(Builder).simplify(&I)) diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index ec75dd2..88bb69b 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -173,14 +173,14 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op, // Adding a one to a single bit bit-field should be turned into an XOR // of the bit. First thing to check is to see if this AND is with a // single bit constant. - const APInt &AndRHSV = cast<ConstantInt>(AndRHS)->getValue(); + const APInt &AndRHSV = AndRHS->getValue(); // If there is only one bit set. if (AndRHSV.isPowerOf2()) { // Ok, at this point, we know that we are masking the result of the // ADD down to exactly one bit. If the constant we are adding has // no bits set below this bit, then we can eliminate the ADD. - const APInt& AddRHS = cast<ConstantInt>(OpRHS)->getValue(); + const APInt& AddRHS = OpRHS->getValue(); // Check to see if any bits below the one bit set in AndRHSV are set. if ((AddRHS & (AndRHSV-1)) == 0) { @@ -209,8 +209,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op, uint32_t BitWidth = AndRHS->getType()->getBitWidth(); uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth); APInt ShlMask(APInt::getHighBitsSet(BitWidth, BitWidth-OpRHSVal)); - ConstantInt *CI = ConstantInt::get(AndRHS->getContext(), - AndRHS->getValue() & ShlMask); + ConstantInt *CI = Builder->getInt(AndRHS->getValue() & ShlMask); if (CI->getValue() == ShlMask) // Masking out bits that the shift already masks. @@ -230,8 +229,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op, uint32_t BitWidth = AndRHS->getType()->getBitWidth(); uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth); APInt ShrMask(APInt::getLowBitsSet(BitWidth, BitWidth - OpRHSVal)); - ConstantInt *CI = ConstantInt::get(Op->getContext(), - AndRHS->getValue() & ShrMask); + ConstantInt *CI = Builder->getInt(AndRHS->getValue() & ShrMask); if (CI->getValue() == ShrMask) // Masking out bits that the shift already masks. @@ -251,8 +249,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op, uint32_t BitWidth = AndRHS->getType()->getBitWidth(); uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth); APInt ShrMask(APInt::getLowBitsSet(BitWidth, BitWidth - OpRHSVal)); - Constant *C = ConstantInt::get(Op->getContext(), - AndRHS->getValue() & ShrMask); + Constant *C = Builder->getInt(AndRHS->getValue() & ShrMask); if (C == AndRHS) { // Masking out bits shifted in. // (Val ashr C1) & C2 -> (Val lshr C1) & C2 // Make the argument unsigned. 
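The final fold above deserves a worked instance: replacing an arithmetic shift with a logical one is sound exactly when the AND mask keeps none of the sign-fill bits, which is what the ShrMask test in the code checks. A small self-checking example with 8-bit values (the constants are arbitrary; note that >> on a negative int is arithmetic on all mainstream C++ implementations):

#include <cassert>
#include <cstdint>

int main() {
  int8_t V = -96;                        // 0xA0, sign bit set
  uint8_t U = static_cast<uint8_t>(V);   // same bit pattern, 0xA0
  // Shifting right by 3, ashr and lshr can only differ in bits 5..7 (the
  // sign-fill positions). The mask 0x1F keeps bits 0..4, i.e. the low
  // BitWidth - ShAmt bits, so the two shifts agree after masking.
  assert(((V >> 3) & 0x1F) == ((U >> 3) & 0x1F));
  return 0;
}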
@@ -279,7 +276,7 @@ Value *InstCombiner::InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, if (Inside) { if (Lo == Hi) // Trivially false. - return ConstantInt::getFalse(V->getContext()); + return Builder->getFalse(); // V >= Min && V < Hi --> V < Hi if (cast<ConstantInt>(Lo)->isMinValue(isSigned)) { @@ -296,7 +293,7 @@ Value *InstCombiner::InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, } if (Lo == Hi) // Trivially true. - return ConstantInt::getTrue(V->getContext()); + return Builder->getTrue(); // V < Min || V >= Hi -> V > Hi-1 Hi = SubOne(cast<ConstantInt>(Hi)); @@ -491,6 +488,26 @@ static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C, return result; } +/// Convert an analysis of a masked ICmp into its equivalent if all boolean +/// operations had the opposite sense. Since each "NotXXX" flag (recording !=) +/// is adjacent to the corresponding normal flag (recording ==), this just +/// involves swapping those bits over. +static unsigned conjugateICmpMask(unsigned Mask) { + unsigned NewMask; + NewMask = (Mask & (FoldMskICmp_AMask_AllOnes | FoldMskICmp_BMask_AllOnes | + FoldMskICmp_Mask_AllZeroes | FoldMskICmp_AMask_Mixed | + FoldMskICmp_BMask_Mixed)) + << 1; + + NewMask |= + (Mask & (FoldMskICmp_AMask_NotAllOnes | FoldMskICmp_BMask_NotAllOnes | + FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_AMask_NotMixed | + FoldMskICmp_BMask_NotMixed)) + >> 1; + + return NewMask; +} + /// decomposeBitTestICmp - Decompose an icmp into the form ((X & Y) pred Z) /// if possible. The returned predicate is either == or !=. Returns false if /// decomposition fails. @@ -551,14 +568,22 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, L21 = L22 = L1 = 0; } else { // Look for ANDs in the LHS icmp. - if (match(L1, m_And(m_Value(L11), m_Value(L12)))) { - if (!match(L2, m_And(m_Value(L21), m_Value(L22)))) - L21 = L22 = 0; - } else { - if (!match(L2, m_And(m_Value(L11), m_Value(L12)))) - return 0; - std::swap(L1, L2); + if (!L1->getType()->isIntegerTy()) { + // You can icmp pointers, for example. They really aren't masks. + L11 = L12 = 0; + } else if (!match(L1, m_And(m_Value(L11), m_Value(L12)))) { + // Any icmp can be viewed as being trivially masked; if it allows us to + // remove one, it's worth it. + L11 = L1; + L12 = Constant::getAllOnesValue(L1->getType()); + } + + if (!L2->getType()->isIntegerTy()) { + // You can icmp pointers, for example. They really aren't masks. L21 = L22 = 0; + } else if (!match(L2, m_And(m_Value(L21), m_Value(L22)))) { + L21 = L2; + L22 = Constant::getAllOnesValue(L2->getType()); } } @@ -579,7 +604,14 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, return 0; } E = R2; R1 = 0; ok = true; - } else if (match(R1, m_And(m_Value(R11), m_Value(R12)))) { + } else if (R1->getType()->isIntegerTy()) { + if (!match(R1, m_And(m_Value(R11), m_Value(R12)))) { + // As before, model no mask as a trivial mask if it'll let us do an + // optimisation. + R11 = R1; + R12 = Constant::getAllOnesValue(R1->getType()); + } + + if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) { A = R11; D = R12; E = R2; ok = true; } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) { @@ -592,7 +624,12 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, return 0; // Look for ANDs on the right side of the RHS icmp.
- if (!ok && match(R2, m_And(m_Value(R11), m_Value(R12)))) { + if (!ok && R2->getType()->isIntegerTy()) { + if (!match(R2, m_And(m_Value(R11), m_Value(R12)))) { + R11 = R2; + R12 = Constant::getAllOnesValue(R2->getType()); + } + + if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) { A = R11; D = R12; E = R1; ok = true; } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) { @@ -621,8 +658,7 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, /// foldLogOpOfMaskedICmps: /// try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) /// into a single (icmp(A & X) ==/!= Y) -static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, - ICmpInst::Predicate NEWCC, +static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, llvm::InstCombiner::BuilderTy* Builder) { Value *A = 0, *B = 0, *C = 0, *D = 0, *E = 0; ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate(); @@ -632,8 +668,24 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, assert(ICmpInst::isEquality(LHSCC) && ICmpInst::isEquality(RHSCC) && "foldLogOpOfMaskedICmpsHelper must return an equality predicate."); - if (NEWCC == ICmpInst::ICMP_NE) - mask >>= 1; // treat "Not"-states as normal states + // In full generality: + // (icmp (A & B) Op C) | (icmp (A & D) Op E) + // == ![ (icmp (A & B) !Op C) & (icmp (A & D) !Op E) ] + // + // If the latter can be converted into (icmp (A & X) Op Y) then the former is + // equivalent to (icmp (A & X) !Op Y). + // + // Therefore, we can pretend for the rest of this function that we're dealing + // with the conjunction, provided we flip the sense of any comparisons (both + // input and output). + + // In most cases we're going to produce an EQ for the "&&" case. + ICmpInst::Predicate NEWCC = IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE; + if (!IsAnd) { + // Convert the masking analysis into its equivalent with negated + // comparisons. + mask = conjugateICmpMask(mask); + } if (mask & FoldMskICmp_Mask_AllZeroes) { // (icmp eq (A & B), 0) & (icmp eq (A & D), 0) @@ -660,6 +712,40 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, Value* newAnd = Builder->CreateAnd(A, newAnd1); return Builder->CreateICmp(NEWCC, newAnd, A); } + + // Remaining cases assume at least that B and D are constant, and depend on + // their actual values. This isn't strictly necessary, just a "handle the + // easy cases for now" decision. + ConstantInt *BCst = dyn_cast<ConstantInt>(B); + if (BCst == 0) return 0; + ConstantInt *DCst = dyn_cast<ConstantInt>(D); + if (DCst == 0) return 0; + + if (mask & (FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_BMask_NotAllOnes)) { + // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and + // (icmp ne (A & B), B) & (icmp ne (A & D), D) + // -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0) + // Only valid if one of the masks is a superset of the other (check "B&D" is + // the same as either B or D). + APInt NewMask = BCst->getValue() & DCst->getValue(); + + if (NewMask == BCst->getValue()) + return LHS; + else if (NewMask == DCst->getValue()) + return RHS; + } + if (mask & FoldMskICmp_AMask_NotAllOnes) { + // (icmp ne (A & B), B) & (icmp ne (A & D), D) + // -> (icmp ne (A & B), A) or (icmp ne (A & D), A) + // Only valid if one of the masks is a superset of the other (check "B|D" is + // the same as either B or D).
+ APInt NewMask = BCst->getValue() | DCst->getValue(); + + if (NewMask == BCst->getValue()) + return LHS; + else if (NewMask == DCst->getValue()) + return RHS; + } if (mask & FoldMskICmp_BMask_Mixed) { // (icmp eq (A & B), C) & (icmp eq (A & D), E) // We already know that B & C == C && D & E == E. @@ -668,14 +754,9 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, // contradict, then we can transform to // -> (icmp eq (A & (B|D)), (C|E)) // Currently, we only handle the case of B, C, D, and E being constant. - ConstantInt *BCst = dyn_cast<ConstantInt>(B); - if (BCst == 0) return 0; - ConstantInt *DCst = dyn_cast<ConstantInt>(D); - if (DCst == 0) return 0; // we can't simply use C and E, because we might actually handle // (icmp ne (A & B), B) & (icmp eq (A & D), D) // with B and D, having a single bit set - ConstantInt *CCst = dyn_cast<ConstantInt>(C); if (CCst == 0) return 0; if (LHSCC != NEWCC) @@ -718,7 +799,7 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { } // handle (roughly): (icmp eq (A & B), C) & (icmp eq (A & D), E) - if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, ICmpInst::ICMP_EQ, Builder)) + if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, true, Builder)) return V; // This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2). @@ -852,10 +933,15 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { case ICmpInst::ICMP_SGT: // (X != 13 & X s> 15) -> X s> 15 return RHS; case ICmpInst::ICMP_NE: + // Special case to get the ordering right when the values wrap around + // zero. + if (LHSCst->getValue() == 0 && RHSCst->getValue().isAllOnesValue()) + std::swap(LHSCst, RHSCst); if (LHSCst == SubOne(RHSCst)){// (X != 13 & X != 14) -> X-13 >u 1 Constant *AddCST = ConstantExpr::getNeg(LHSCst); Value *Add = Builder->CreateAdd(Val, AddCST, Val->getName()+".off"); - return Builder->CreateICmpUGT(Add, ConstantInt::get(Add->getType(), 1)); + return Builder->CreateICmpUGT(Add, ConstantInt::get(Add->getType(), 1), + Val->getName()+".cmp"); } break; // (X != 13 & X != 15) -> no change } @@ -943,7 +1029,7 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { // If either of the constants are nans, then the whole thing returns // false. if (LHSC->getValueAPF().isNaN() || RHSC->getValueAPF().isNaN()) - return ConstantInt::getFalse(LHS->getContext()); + return Builder->getFalse(); return Builder->CreateFCmpORD(LHS->getOperand(0), RHS->getOperand(0)); } @@ -1302,7 +1388,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { /// always in the local (OverallLeftShift) coordinate space. /// static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask, - SmallVector<Value*, 8> &ByteValues) { + SmallVectorImpl<Value *> &ByteValues) { if (Instruction *I = dyn_cast<Instruction>(V)) { // If this is an or instruction, it may be an inner node of the bswap. if (I->getOpcode() == Instruction::Or) { @@ -1380,7 +1466,7 @@ static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask, // into a byteswap. At least one of the two bytes would not be aligned with // their ultimate destination. if (!isPowerOf2_32(ByteMask)) return true; - unsigned InputByteNo = CountTrailingZeros_32(ByteMask); + unsigned InputByteNo = countTrailingZeros(ByteMask); // 2) The input and ultimate destinations must line up: if byte 3 of an i32 // is demanded, it needs to go into byte 0 of the result. 
This means that the @@ -1457,10 +1543,60 @@ static Instruction *MatchSelectFromAndOr(Value *A, Value *B, return 0; } +/// IsOneHotValue - Returns true for "one-hot" values (values where at most +/// one bit can be set). +static bool IsOneHotValue(Value *V) { + // Match 1<<K. + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) + if (BO->getOpcode() == Instruction::Shl) { + ConstantInt *One = dyn_cast<ConstantInt>(BO->getOperand(0)); + return One && One->isOne(); + } + + // Check for power of two integer constants. + if (ConstantInt *K = dyn_cast<ConstantInt>(V)) + return K->getValue().isPowerOf2(); + + return false; +} + /// FoldOrOfICmps - Fold (icmp)|(icmp) if possible. Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate(); + // Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2) + // if K1 and K2 are one-bit masks. + ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1)); + ConstantInt *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1)); + + if (LHS->getPredicate() == ICmpInst::ICMP_EQ && LHSCst && LHSCst->isZero() && + RHS->getPredicate() == ICmpInst::ICMP_EQ && RHSCst && RHSCst->isZero()) { + + BinaryOperator *LAnd = dyn_cast<BinaryOperator>(LHS->getOperand(0)); + BinaryOperator *RAnd = dyn_cast<BinaryOperator>(RHS->getOperand(0)); + if (LAnd && RAnd && LAnd->hasOneUse() && RAnd->hasOneUse() && + LAnd->getOpcode() == Instruction::And && + RAnd->getOpcode() == Instruction::And) { + + Value *Mask = 0; + Value *Masked = 0; + if (LAnd->getOperand(0) == RAnd->getOperand(0) && + IsOneHotValue(LAnd->getOperand(1)) && + IsOneHotValue(RAnd->getOperand(1))) { + Mask = Builder->CreateOr(LAnd->getOperand(1), RAnd->getOperand(1)); + Masked = Builder->CreateAnd(LAnd->getOperand(0), Mask); + } else if (LAnd->getOperand(1) == RAnd->getOperand(1) && + IsOneHotValue(LAnd->getOperand(0)) && + IsOneHotValue(RAnd->getOperand(0))) { + Mask = Builder->CreateOr(LAnd->getOperand(0), RAnd->getOperand(0)); + Masked = Builder->CreateAnd(LAnd->getOperand(1), Mask); + } + + if (Masked) + return Builder->CreateICmp(ICmpInst::ICMP_NE, Masked, Mask); + } + } + // (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B) if (PredicatesFoldable(LHSCC, RHSCC)) { if (LHS->getOperand(0) == RHS->getOperand(1) && @@ -1477,13 +1613,37 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { // handle (roughly): // (icmp ne (A & B), C) | (icmp ne (A & D), E) - if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, ICmpInst::ICMP_NE, Builder)) + if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, false, Builder)) return V; - // This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2).
Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0); - ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1)); - ConstantInt *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1)); + if (LHS->hasOneUse() || RHS->hasOneUse()) { + // (icmp eq B, 0) | (icmp ult A, B) -> (icmp ule A, B-1) + // (icmp eq B, 0) | (icmp ugt B, A) -> (icmp ule A, B-1) + Value *A = 0, *B = 0; + if (LHSCC == ICmpInst::ICMP_EQ && LHSCst && LHSCst->isZero()) { + B = Val; + if (RHSCC == ICmpInst::ICMP_ULT && Val == RHS->getOperand(1)) + A = Val2; + else if (RHSCC == ICmpInst::ICMP_UGT && Val == Val2) + A = RHS->getOperand(1); + } + // (icmp ult A, B) | (icmp eq B, 0) -> (icmp ule A, B-1) + // (icmp ugt B, A) | (icmp eq B, 0) -> (icmp ule A, B-1) + else if (RHSCC == ICmpInst::ICMP_EQ && RHSCst && RHSCst->isZero()) { + B = Val2; + if (LHSCC == ICmpInst::ICMP_ULT && Val2 == LHS->getOperand(1)) + A = Val; + else if (LHSCC == ICmpInst::ICMP_UGT && Val2 == Val) + A = LHS->getOperand(1); + } + if (A && B) + return Builder->CreateICmp( + ICmpInst::ICMP_UGE, + Builder->CreateAdd(B, ConstantInt::getSigned(B->getType(), -1)), A); + } + + // This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2). if (LHSCst == 0 || RHSCst == 0) return 0; if (LHSCst == RHSCst && LHSCC == RHSCC) { @@ -1588,7 +1748,7 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { case ICmpInst::ICMP_NE: // (X != 13 | X != 15) -> true case ICmpInst::ICMP_ULT: // (X != 13 | X u< 15) -> true case ICmpInst::ICMP_SLT: // (X != 13 | X s< 15) -> true - return ConstantInt::getTrue(LHS->getContext()); + return Builder->getTrue(); } case ICmpInst::ICMP_ULT: switch (RHSCC) { @@ -1640,7 +1800,7 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { break; case ICmpInst::ICMP_NE: // (X u> 13 | X != 15) -> true case ICmpInst::ICMP_ULT: // (X u> 13 | X u< 15) -> true - return ConstantInt::getTrue(LHS->getContext()); + return Builder->getTrue(); case ICmpInst::ICMP_SLT: // (X u> 13 | X s< 15) -> no change break; } @@ -1655,7 +1815,7 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { break; case ICmpInst::ICMP_NE: // (X s> 13 | X != 15) -> true case ICmpInst::ICMP_SLT: // (X s> 13 | X s< 15) -> true - return ConstantInt::getTrue(LHS->getContext()); + return Builder->getTrue(); case ICmpInst::ICMP_ULT: // (X s> 13 | X u< 15) -> no change break; } @@ -1676,7 +1836,7 @@ Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { // If either of the constants are nans, then the whole thing returns // true. if (LHSC->getValueAPF().isNaN() || RHSC->getValueAPF().isNaN()) - return ConstantInt::getTrue(LHS->getContext()); + return Builder->getTrue(); // Otherwise, no need to compare the two constants, compare the // rest. @@ -1779,8 +1939,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { Value *Or = Builder->CreateOr(X, RHS); Or->takeName(Op0); return BinaryOperator::CreateAnd(Or, - ConstantInt::get(I.getContext(), - RHS->getValue() | C1->getValue())); + Builder->getInt(RHS->getValue() | C1->getValue())); } // (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2) @@ -1789,8 +1948,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { Value *Or = Builder->CreateOr(X, RHS); Or->takeName(Op0); return BinaryOperator::CreateXor(Or, - ConstantInt::get(I.getContext(), - C1->getValue() & ~RHS->getValue())); + Builder->getInt(C1->getValue() & ~RHS->getValue())); } // Try to fold constant and into select arguments. 
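The new FoldOrOfICmps case above merges a zero test into an unsigned bound: (icmp eq B, 0) | (icmp ult A, B) becomes A <=u B-1, emitted as an ICMP_UGE of B-1 against A. It relies on unsigned wraparound: when B is 0, B-1 wraps to the all-ones value and the comparison is vacuously true, absorbing the eq-zero disjunct. A brute-force check at 8 bits (illustrative C++, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned a = 0; a < 256; ++a)
        for (unsigned b = 0; b < 256; ++b) {
          bool orig = (b == 0) || (a < b);              // (B == 0) | (A <u B)
          bool folded = (uint8_t)a <= (uint8_t)(b - 1); // A <=u B-1, wraps at B == 0
          assert(orig == folded);
        }
      return 0;
    }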
@@ -1872,15 +2030,13 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { ((V1 == B && MaskedValueIsZero(V2, ~C1->getValue())) || // (V|N) (V2 == B && MaskedValueIsZero(V1, ~C1->getValue())))) // (N|V) return BinaryOperator::CreateAnd(A, - ConstantInt::get(A->getContext(), - C1->getValue()|C2->getValue())); + Builder->getInt(C1->getValue()|C2->getValue())); // Or commutes, try both ways. if (match(B, m_Or(m_Value(V1), m_Value(V2))) && ((V1 == A && MaskedValueIsZero(V2, ~C2->getValue())) || // (V|N) (V2 == A && MaskedValueIsZero(V1, ~C2->getValue())))) // (N|V) return BinaryOperator::CreateAnd(B, - ConstantInt::get(B->getContext(), - C1->getValue()|C2->getValue())); + Builder->getInt(C1->getValue()|C2->getValue())); // ((V|C3)&C1) | ((V|C4)&C2) --> (V|C3|C4)&(C1|C2) // iff (C1&C2) == 0 and (C3&~C1) == 0 and (C4&~C2) == 0. @@ -1891,8 +2047,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { (C4->getValue() & ~C2->getValue()) == 0) { V2 = Builder->CreateOr(V1, ConstantExpr::getOr(C3, C4), "bitfield"); return BinaryOperator::CreateAnd(V2, - ConstantInt::get(B->getContext(), - C1->getValue()|C2->getValue())); + Builder->getInt(C1->getValue()|C2->getValue())); } } } @@ -2160,8 +2315,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { if (CI->hasOneUse() && Op0C->hasOneUse()) { Instruction::CastOps Opcode = Op0C->getOpcode(); if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) && - (RHS == ConstantExpr::getCast(Opcode, - ConstantInt::getTrue(I.getContext()), + (RHS == ConstantExpr::getCast(Opcode, Builder->getTrue(), Op0C->getDestTy()))) { CI->setPredicate(CI->getInversePredicate()); return CastInst::Create(Opcode, CI, Op0C->getType()); @@ -2191,8 +2345,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { Op0I->getOperand(0)); } else if (RHS->getValue().isSignBit()) { // (X + C) ^ signbit -> (X + C + signbit) - Constant *C = ConstantInt::get(I.getContext(), - RHS->getValue() + Op0CI->getValue()); + Constant *C = Builder->getInt(RHS->getValue() + Op0CI->getValue()); return BinaryOperator::CreateAdd(Op0I->getOperand(0), C); } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 78b4a2c..0cd7b14 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -946,7 +946,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { int ix = FTy->getNumParams(); // See if we can optimize any arguments passed through the varargs area of // the call. - for (CallSite::arg_iterator I = CS.arg_begin()+FTy->getNumParams(), + for (CallSite::arg_iterator I = CS.arg_begin() + FTy->getNumParams(), E = CS.arg_end(); I != E; ++I, ++ix) { CastInst *CI = dyn_cast<CastInst>(*I); if (CI && isSafeToEliminateVarargsCast(CS, CI, TD, ix)) { @@ -999,19 +999,15 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { // Check to see if we are changing the return type... if (OldRetTy != NewRetTy) { - if (Callee->isDeclaration() && - // Conversion is ok if changing from one pointer type to another or from - // a pointer to an integer of the same size. - !((OldRetTy->isPointerTy() || !TD || - OldRetTy == TD->getIntPtrType(Caller->getContext())) && - (NewRetTy->isPointerTy() || !TD || - NewRetTy == TD->getIntPtrType(Caller->getContext())))) - return false; // Cannot transform this return value. 
+ if (!CastInst::isBitCastable(NewRetTy, OldRetTy)) { + if (Callee->isDeclaration()) + return false; // Cannot transform this return value. - if (!Caller->use_empty() && - // void -> non-void is handled specially - !NewRetTy->isVoidTy() && !CastInst::isCastable(NewRetTy, OldRetTy)) + if (!Caller->use_empty() && + // void -> non-void is handled specially + !NewRetTy->isVoidTy()) return false; // Cannot transform this return value. + } if (!CallerPAL.isEmpty() && !Caller->use_empty()) { AttrBuilder RAttrs(CallerPAL, AttributeSet::ReturnIndex); @@ -1036,7 +1032,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { return false; } - unsigned NumActualArgs = unsigned(CS.arg_end()-CS.arg_begin()); + unsigned NumActualArgs = CS.arg_size(); unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs); CallSite::arg_iterator AI = CS.arg_begin(); @@ -1044,7 +1040,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { Type *ParamTy = FT->getParamType(i); Type *ActTy = (*AI)->getType(); - if (!CastInst::isCastable(ActTy, ParamTy)) + if (!CastInst::isBitCastable(ActTy, ParamTy)) return false; // Cannot transform this parameter value. if (AttrBuilder(CallerPAL.getParamAttributes(i + 1), i + 1). @@ -1061,20 +1057,11 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { if (ParamPTy == 0 || !ParamPTy->getElementType()->isSized() || TD == 0) return false; - Type *CurElTy = cast<PointerType>(ActTy)->getElementType(); + Type *CurElTy = ActTy->getPointerElementType(); if (TD->getTypeAllocSize(CurElTy) != TD->getTypeAllocSize(ParamPTy->getElementType())) return false; } - - // Converting from one pointer type to another or between a pointer and an - // integer of the same size is safe even if we do not have a body. - bool isConvertible = ActTy == ParamTy || - (TD && ((ParamTy->isPointerTy() || - ParamTy == TD->getIntPtrType(Caller->getContext())) && - (ActTy->isPointerTy() || - ActTy == TD->getIntPtrType(Caller->getContext())))); - if (Callee->isDeclaration() && !isConvertible) return false; } if (Callee->isDeclaration()) { @@ -1141,12 +1128,11 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { AI = CS.arg_begin(); for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) { Type *ParamTy = FT->getParamType(i); + if ((*AI)->getType() == ParamTy) { Args.push_back(*AI); } else { - Instruction::CastOps opcode = CastInst::getCastOpcode(*AI, - false, ParamTy, false); - Args.push_back(Builder->CreateCast(opcode, *AI, ParamTy)); + Args.push_back(Builder->CreateBitCast(*AI, ParamTy)); } // Add any parameter attributes. 
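The transformConstExprCastCall hunks above drop the old pointer-vs-intptr special cases: a mismatched return or argument type is now only rewritten when CastInst::isBitCastable holds, after which a plain CreateBitCast suffices. A bitcast reinterprets the same bits at the same size with no value conversion; the C++ analogue of that requirement is a memcpy between equally sized types (illustrative sketch, not from the patch):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      // A bitcast is only legal between types of identical size...
      static_assert(sizeof(float) == sizeof(uint32_t), "equal sizes required");
      float f = 1.0f;
      uint32_t bits;
      // ...and it preserves the representation exactly, as memcpy does here.
      std::memcpy(&bits, &f, sizeof bits);
      assert(bits == 0x3f800000u); // IEEE-754 single-precision encoding of 1.0
      return 0;
    }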
@@ -1217,9 +1203,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { Value *NV = NC; if (OldRetTy != NV->getType() && !Caller->use_empty()) { if (!NV->getType()->isVoidTy()) { - Instruction::CastOps opcode = - CastInst::getCastOpcode(NC, false, OldRetTy, false); - NV = NC = CastInst::Create(opcode, NC, OldRetTy); + NV = NC = CastInst::Create(CastInst::BitCast, NC, OldRetTy); NC->setDebugLoc(Caller->getDebugLoc()); // If this is an invoke instruction, we should insert it after the first @@ -1287,7 +1271,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, if (NestTy) { Instruction *Caller = CS.getInstruction(); std::vector<Value*> NewArgs; - NewArgs.reserve(unsigned(CS.arg_end()-CS.arg_begin())+1); + NewArgs.reserve(CS.arg_size() + 1); SmallVector<AttributeSet, 8> NewAttrs; NewAttrs.reserve(Attrs.getNumSlots() + 1); diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 2ee1278..72377dc 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -677,7 +677,6 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { case Instruction::Add: case Instruction::Sub: case Instruction::Mul: - case Instruction::Shl: if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear) || !CanEvaluateZExtd(I->getOperand(1), Ty, Tmp)) return false; @@ -701,6 +700,17 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { // Otherwise, we don't know how to analyze this BitsToClear case yet. return false; + case Instruction::Shl: + // We can promote shl(x, cst) if we can promote x. Since shl overwrites the + // upper bits we can reduce BitsToClear by the shift amount. + if (ConstantInt *Amt = dyn_cast<ConstantInt>(I->getOperand(1))) { + if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear)) + return false; + uint64_t ShiftAmt = Amt->getZExtValue(); + BitsToClear = ShiftAmt < BitsToClear ? BitsToClear - ShiftAmt : 0; + return true; + } + return false; case Instruction::LShr: // We can promote lshr(x, cst) if we can promote x. This requires the // ultimate 'and' to clear out the high zero bits we're clearing out though. @@ -1219,6 +1229,19 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { } } + // (fptrunc (select cond, R1, Cst)) --> + // (select cond, (fptrunc R1), (fptrunc Cst)) + SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0)); + if (SI && + (isa<ConstantFP>(SI->getOperand(1)) || + isa<ConstantFP>(SI->getOperand(2)))) { + Value *LHSTrunc = Builder->CreateFPTrunc(SI->getOperand(1), + CI.getType()); + Value *RHSTrunc = Builder->CreateFPTrunc(SI->getOperand(2), + CI.getType()); + return SelectInst::Create(SI->getOperand(0), LHSTrunc, RHSTrunc); + } + IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI.getOperand(0)); if (II) { switch (II->getIntrinsicID()) { @@ -1239,9 +1262,14 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { } // Fold (fptrunc (sqrt (fpext x))) -> (sqrtf x) + // Note that we restrict this transformation based on + // TLI->has(LibFunc::sqrtf), even for the sqrt intrinsic, because + // TLI->has(LibFunc::sqrtf) is sufficient to guarantee that the + // single-precision intrinsic can be expanded in the backend. 
CallInst *Call = dyn_cast<CallInst>(CI.getOperand(0)); if (Call && Call->getCalledFunction() && TLI->has(LibFunc::sqrtf) && - Call->getCalledFunction()->getName() == TLI->getName(LibFunc::sqrt) && + (Call->getCalledFunction()->getName() == TLI->getName(LibFunc::sqrt) || + Call->getCalledFunction()->getIntrinsicID() == Intrinsic::sqrt) && Call->getNumArgOperands() == 1 && Call->hasOneUse()) { CastInst *Arg = dyn_cast<CastInst>(Call->getArgOperand(0)); @@ -1252,11 +1280,11 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { Arg->getOperand(0)->getType()->isFloatTy()) { Function *Callee = Call->getCalledFunction(); Module *M = CI.getParent()->getParent()->getParent(); - Constant *SqrtfFunc = M->getOrInsertFunction("sqrtf", - Callee->getAttributes(), - Builder->getFloatTy(), - Builder->getFloatTy(), - NULL); + Constant *SqrtfFunc = (Callee->getIntrinsicID() == Intrinsic::sqrt) ? + Intrinsic::getDeclaration(M, Intrinsic::sqrt, Builder->getFloatTy()) : + M->getOrInsertFunction("sqrtf", Callee->getAttributes(), + Builder->getFloatTy(), Builder->getFloatTy(), + NULL); CallInst *ret = CallInst::Create(SqrtfFunc, Arg->getOperand(0), "sqrtfcall"); ret->setAttributes(Callee->getAttributes()); @@ -1328,14 +1356,18 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) { // If the source integer type is not the intptr_t type for this target, do a // trunc or zext to the intptr_t type, then inttoptr of it. This allows the // cast to be exposed to other transforms. - if (TD && CI.getOperand(0)->getType()->getScalarSizeInBits() != - TD->getPointerSizeInBits()) { - Type *Ty = TD->getIntPtrType(CI.getContext()); - if (CI.getType()->isVectorTy()) // Handle vectors of pointers. - Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements()); - - Value *P = Builder->CreateZExtOrTrunc(CI.getOperand(0), Ty); - return new IntToPtrInst(P, CI.getType()); + + if (TD) { + unsigned AS = CI.getAddressSpace(); + if (CI.getOperand(0)->getType()->getScalarSizeInBits() != + TD->getPointerSizeInBits(AS)) { + Type *Ty = TD->getIntPtrType(CI.getContext(), AS); + if (CI.getType()->isVectorTy()) // Handle vectors of pointers. + Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements()); + + Value *P = Builder->CreateZExtOrTrunc(CI.getOperand(0), Ty); + return new IntToPtrInst(P, CI.getType()); + } } if (Instruction *I = commonCastTransforms(CI)) @@ -1360,25 +1392,32 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) { return &CI; } + if (!TD) + return commonCastTransforms(CI); + // If the GEP has a single use, and the base pointer is a bitcast, and the // GEP computes a constant offset, see if we can convert these three // instructions into fewer. This typically happens with unions and other // non-type-safe code. - APInt Offset(TD ? TD->getPointerSizeInBits() : 1, 0); - if (TD && GEP->hasOneUse() && isa<BitCastInst>(GEP->getOperand(0)) && + unsigned AS = GEP->getPointerAddressSpace(); + unsigned OffsetBits = TD->getPointerSizeInBits(AS); + APInt Offset(OffsetBits, 0); + BitCastInst *BCI = dyn_cast<BitCastInst>(GEP->getOperand(0)); + if (GEP->hasOneUse() && + BCI && GEP->accumulateConstantOffset(*TD, Offset)) { // Get the base pointer input of the bitcast, and the type it points to. 
- Value *OrigBase = cast<BitCastInst>(GEP->getOperand(0))->getOperand(0); - Type *GEPIdxTy = - cast<PointerType>(OrigBase->getType())->getElementType(); + Value *OrigBase = BCI->getOperand(0); SmallVector<Value*, 8> NewIndices; - if (FindElementAtOffset(GEPIdxTy, Offset.getSExtValue(), NewIndices)) { + if (FindElementAtOffset(OrigBase->getType(), + Offset.getSExtValue(), + NewIndices)) { // If we were able to index down into an element, create the GEP // and bitcast the result. This eliminates one bitcast, potentially // two. Value *NGEP = cast<GEPOperator>(GEP)->isInBounds() ? - Builder->CreateInBoundsGEP(OrigBase, NewIndices) : - Builder->CreateGEP(OrigBase, NewIndices); + Builder->CreateInBoundsGEP(OrigBase, NewIndices) : + Builder->CreateGEP(OrigBase, NewIndices); NGEP->takeName(GEP); if (isa<BitCastInst>(CI)) @@ -1396,16 +1435,22 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) { // If the destination integer type is not the intptr_t type for this target, // do a ptrtoint to intptr_t then do a trunc or zext. This allows the cast // to be exposed to other transforms. - if (TD && CI.getType()->getScalarSizeInBits() != TD->getPointerSizeInBits()) { - Type *Ty = TD->getIntPtrType(CI.getContext()); - if (CI.getType()->isVectorTy()) // Handle vectors of pointers. - Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements()); - Value *P = Builder->CreatePtrToInt(CI.getOperand(0), Ty); - return CastInst::CreateIntegerCast(P, CI.getType(), /*isSigned=*/false); - } + if (!TD) + return commonPointerCastTransforms(CI); + + Type *Ty = CI.getType(); + unsigned AS = CI.getPointerAddressSpace(); + + if (Ty->getScalarSizeInBits() == TD->getPointerSizeInBits(AS)) + return commonPointerCastTransforms(CI); - return commonPointerCastTransforms(CI); + Type *PtrTy = TD->getIntPtrType(CI.getContext(), AS); + if (Ty->isVectorTy()) // Handle vectors of pointers. + PtrTy = VectorType::get(PtrTy, Ty->getVectorNumElements()); + + Value *P = Builder->CreatePtrToInt(CI.getOperand(0), PtrTy); + return CastInst::CreateIntegerCast(P, Ty, /*isSigned=*/false); } /// OptimizeVectorResize - This input value (which is known to have vector type) @@ -1478,12 +1523,17 @@ static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) { /// insertions into the vector. See the example in the comment for /// OptimizeIntegerToVectorInsertions for the pattern this handles. /// The type of V is always a non-zero multiple of VecEltTy's size. +/// Shift is the number of bits between the lsb of V and the lsb of +/// the vector. /// /// This returns false if the pattern can't be matched or true if it can, /// filling in Elements with the elements found here. -static bool CollectInsertionElements(Value *V, unsigned ElementIndex, +static bool CollectInsertionElements(Value *V, unsigned Shift, SmallVectorImpl<Value*> &Elements, - Type *VecEltTy) { + Type *VecEltTy, InstCombiner &IC) { + assert(isMultipleOfTypeSize(Shift, VecEltTy) && + "Shift should be a multiple of the element type size"); + // Undef values never contribute useful bits to the result. if (isa<UndefValue>(V)) return true; @@ -1495,8 +1545,12 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, if (C->isNullValue()) return true; + unsigned ElementIndex = getTypeSizeIndex(Shift, VecEltTy); + if (IC.getDataLayout()->isBigEndian()) + ElementIndex = Elements.size() - ElementIndex - 1; + // Fail if multiple elements are inserted into this slot. 
- if (ElementIndex >= Elements.size() || Elements[ElementIndex] != 0) + if (Elements[ElementIndex] != 0) return false; Elements[ElementIndex] = V; @@ -1512,7 +1566,7 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, // it to the right type so it gets properly inserted. if (NumElts == 1) return CollectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy), - ElementIndex, Elements, VecEltTy); + Shift, Elements, VecEltTy, IC); // Okay, this is a constant that covers multiple elements. Slice it up into // pieces and insert each element-sized piece into the vector. @@ -1523,10 +1577,11 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize); for (unsigned i = 0; i != NumElts; ++i) { + unsigned ShiftI = Shift+i*ElementSize; Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(), - i*ElementSize)); + ShiftI)); Piece = ConstantExpr::getTrunc(Piece, ElementIntTy); - if (!CollectInsertionElements(Piece, ElementIndex+i, Elements, VecEltTy)) + if (!CollectInsertionElements(Piece, ShiftI, Elements, VecEltTy, IC)) return false; } return true; @@ -1539,29 +1594,28 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, switch (I->getOpcode()) { default: return false; // Unhandled case. case Instruction::BitCast: - return CollectInsertionElements(I->getOperand(0), ElementIndex, - Elements, VecEltTy); + return CollectInsertionElements(I->getOperand(0), Shift, + Elements, VecEltTy, IC); case Instruction::ZExt: if (!isMultipleOfTypeSize( I->getOperand(0)->getType()->getPrimitiveSizeInBits(), VecEltTy)) return false; - return CollectInsertionElements(I->getOperand(0), ElementIndex, - Elements, VecEltTy); + return CollectInsertionElements(I->getOperand(0), Shift, + Elements, VecEltTy, IC); case Instruction::Or: - return CollectInsertionElements(I->getOperand(0), ElementIndex, - Elements, VecEltTy) && - CollectInsertionElements(I->getOperand(1), ElementIndex, - Elements, VecEltTy); + return CollectInsertionElements(I->getOperand(0), Shift, + Elements, VecEltTy, IC) && + CollectInsertionElements(I->getOperand(1), Shift, + Elements, VecEltTy, IC); case Instruction::Shl: { // Must be shifting by a constant that is a multiple of the element size. ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1)); if (CI == 0) return false; - if (!isMultipleOfTypeSize(CI->getZExtValue(), VecEltTy)) return false; - unsigned IndexShift = getTypeSizeIndex(CI->getZExtValue(), VecEltTy); - - return CollectInsertionElements(I->getOperand(0), ElementIndex+IndexShift, - Elements, VecEltTy); + Shift += CI->getZExtValue(); + if (!isMultipleOfTypeSize(Shift, VecEltTy)) return false; + return CollectInsertionElements(I->getOperand(0), Shift, + Elements, VecEltTy, IC); } } @@ -1584,12 +1638,15 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, /// Into two insertelements that do "buildvector{%inc, %inc5}". static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI, InstCombiner &IC) { + // We need to know the target byte order to perform this optimization. 
+ if (!IC.getDataLayout()) return 0; + VectorType *DestVecTy = cast<VectorType>(CI.getType()); Value *IntInput = CI.getOperand(0); SmallVector<Value*, 8> Elements(DestVecTy->getNumElements()); if (!CollectInsertionElements(IntInput, 0, Elements, - DestVecTy->getElementType())) + DestVecTy->getElementType(), IC)) return 0; // If we succeeded, we know that all of the elements are specified by Elements @@ -1775,10 +1832,9 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { // Okay, we have (bitcast (shuffle ..)). Check to see if this is // a bitcast to a vector with the same # elts. if (SVI->hasOneUse() && DestTy->isVectorTy() && - cast<VectorType>(DestTy)->getNumElements() == - SVI->getType()->getNumElements() && + DestTy->getVectorNumElements() == SVI->getType()->getNumElements() && SVI->getType()->getNumElements() == - cast<VectorType>(SVI->getOperand(0)->getType())->getNumElements()) { + SVI->getOperand(0)->getType()->getVectorNumElements()) { BitCastInst *Tmp; // If either of the operands is a cast from CI.getType(), then // evaluating the shuffle in the casted destination's type will allow @@ -1800,3 +1856,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { return commonPointerCastTransforms(CI); return commonCastTransforms(CI); } + +Instruction *InstCombiner::visitAddrSpaceCast(AddrSpaceCastInst &CI) { + return commonCastTransforms(CI); +} diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 4c252c0..9bb65ef 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -227,7 +228,8 @@ Instruction *InstCombiner:: FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, CmpInst &ICI, ConstantInt *AndCst) { // We need TD information to know the pointer size unless this is inbounds. - if (!GEP->isInBounds() && TD == 0) return 0; + if (!GEP->isInBounds() && TD == 0) + return 0; Constant *Init = GV->getInitializer(); if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init)) @@ -393,16 +394,19 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, // If the index is larger than the pointer size of the target, truncate the // index down like the GEP would do implicitly. We don't have to do this for // an inbounds GEP because the index can't be out of range. - if (!GEP->isInBounds() && - Idx->getType()->getPrimitiveSizeInBits() > TD->getPointerSizeInBits()) - Idx = Builder->CreateTrunc(Idx, TD->getIntPtrType(Idx->getContext())); + if (!GEP->isInBounds()) { + Type *IntPtrTy = TD->getIntPtrType(GEP->getType()); + unsigned PtrSize = IntPtrTy->getIntegerBitWidth(); + if (Idx->getType()->getPrimitiveSizeInBits() > PtrSize) + Idx = Builder->CreateTrunc(Idx, IntPtrTy); + } // If the comparison is only true for one or two elements, emit direct // comparisons. if (SecondTrueElement != Overdefined) { // None true -> false. if (FirstTrueElement == Undefined) - return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(GEP->getContext())); + return ReplaceInstUsesWith(ICI, Builder->getFalse()); Value *FirstTrueIdx = ConstantInt::get(Idx->getType(), FirstTrueElement); @@ -422,7 +426,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, if (SecondFalseElement != Overdefined) { // None false -> true.
if (FirstFalseElement == Undefined) - return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(GEP->getContext())); + return ReplaceInstUsesWith(ICI, Builder->getTrue()); Value *FirstFalseIdx = ConstantInt::get(Idx->getType(), FirstFalseElement); @@ -562,16 +566,18 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) { } } + + // Okay, we know we have a single variable index, which must be a // pointer/array/vector index. If there is no offset, life is simple, return // the index. - unsigned IntPtrWidth = TD.getPointerSizeInBits(); + Type *IntPtrTy = TD.getIntPtrType(GEP->getOperand(0)->getType()); + unsigned IntPtrWidth = IntPtrTy->getIntegerBitWidth(); if (Offset == 0) { // Cast to intptrty in case a truncation occurs. If an extension is needed, // we don't need to bother extending: the extension won't affect where the // computation crosses zero. if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth) { - Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext()); VariableIdx = IC.Builder->CreateTrunc(VariableIdx, IntPtrTy); } return VariableIdx; @@ -593,7 +599,6 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) { return 0; // Okay, we can do this evaluation. Start by converting the index to intptr. - Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext()); if (VariableIdx->getType() != IntPtrTy) VariableIdx = IC.Builder->CreateIntCast(VariableIdx, IntPtrTy, true /*Signed*/); @@ -647,8 +652,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, // If all indices are the same, just compare the base pointers. if (IndicesTheSame) - return new ICmpInst(ICmpInst::getSignedPredicate(Cond), - GEPLHS->getOperand(0), GEPRHS->getOperand(0)); + return new ICmpInst(Cond, GEPLHS->getOperand(0), GEPRHS->getOperand(0)); // If we're comparing GEPs with two base pointers that only differ in type // and both GEPs have only constant indices or just one use, then fold @@ -679,7 +683,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, } if (AllZeros) return FoldGEPICmp(GEPRHS, GEPLHS->getOperand(0), - ICmpInst::getSwappedPredicate(Cond), I); + ICmpInst::getSwappedPredicate(Cond), I); // If the other GEP has all zero indices, recurse. AllZeros = true; @@ -712,8 +716,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, if (NumDifferences == 0) // SAME GEP? return ReplaceInstUsesWith(I, // No comparison is needed here. - ConstantInt::get(Type::getInt1Ty(I.getContext()), - ICmpInst::isTrueWhenEqual(Cond))); + Builder->getInt1(ICmpInst::isTrueWhenEqual(Cond))); else if (NumDifferences == 1 && GEPsInBounds) { Value *LHSV = GEPLHS->getOperand(DiffOperand); @@ -739,10 +742,9 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, } /// FoldICmpAddOpCst - Fold "icmp pred (X+CI), X". -Instruction *InstCombiner::FoldICmpAddOpCst(ICmpInst &ICI, +Instruction *InstCombiner::FoldICmpAddOpCst(Instruction &ICI, Value *X, ConstantInt *CI, - ICmpInst::Predicate Pred, - Value *TheAdd) { + ICmpInst::Predicate Pred) { // If we have X+0, exit early (simplifying logic below) and let it get folded // elsewhere. icmp X+0, X -> icmp X, X if (CI->isZero()) { @@ -752,11 +754,11 @@ Instruction *InstCombiner::FoldICmpAddOpCst(ICmpInst &ICI, // (X+4) == X -> false. if (Pred == ICmpInst::ICMP_EQ) - return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(X->getContext())); + return ReplaceInstUsesWith(ICI, Builder->getFalse()); // (X+4) != X -> true. 
if (Pred == ICmpInst::ICMP_NE) - return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(X->getContext())); + return ReplaceInstUsesWith(ICI, Builder->getTrue()); // From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0, // so the values can never be equal. Similarly for all other "or equals" @@ -798,7 +800,7 @@ Instruction *InstCombiner::FoldICmpAddOpCst(ICmpInst &ICI, // (X+ -1) >s X --> X <s (MAXSINT-(-1-1)) --> X == -128 assert(Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE); - Constant *C = ConstantInt::get(X->getContext(), CI->getValue()-1); + Constant *C = Builder->getInt(CI->getValue()-1); return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantExpr::getSub(SMax, C)); } @@ -921,7 +923,7 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, default: llvm_unreachable("Unhandled icmp opcode!"); case ICmpInst::ICMP_EQ: if (LoOverflow && HiOverflow) - return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(ICI.getContext())); + return ReplaceInstUsesWith(ICI, Builder->getFalse()); if (HiOverflow) return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE, X, LoBound); @@ -932,7 +934,7 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, DivIsSigned, true)); case ICmpInst::ICMP_NE: if (LoOverflow && HiOverflow) - return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(ICI.getContext())); + return ReplaceInstUsesWith(ICI, Builder->getTrue()); if (HiOverflow) return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, X, LoBound); @@ -944,16 +946,16 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_SLT: if (LoOverflow == +1) // Low bound is greater than input range. - return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(ICI.getContext())); + return ReplaceInstUsesWith(ICI, Builder->getTrue()); if (LoOverflow == -1) // Low bound is less than input range. - return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(ICI.getContext())); + return ReplaceInstUsesWith(ICI, Builder->getFalse()); return new ICmpInst(Pred, X, LoBound); case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_SGT: if (HiOverflow == +1) // High bound greater than input range. - return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(ICI.getContext())); + return ReplaceInstUsesWith(ICI, Builder->getFalse()); if (HiOverflow == -1) // High bound less than input range. - return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(ICI.getContext())); + return ReplaceInstUsesWith(ICI, Builder->getTrue()); if (Pred == ICmpInst::ICMP_UGT) return new ICmpInst(ICmpInst::ICMP_UGE, X, HiBound); return new ICmpInst(ICmpInst::ICMP_SGE, X, HiBound); @@ -1017,7 +1019,7 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr, // If we are comparing against bits always shifted out, the // comparison cannot succeed. APInt Comp = CmpRHSV << ShAmtVal; - ConstantInt *ShiftedCmpRHS = ConstantInt::get(ICI.getContext(), Comp); + ConstantInt *ShiftedCmpRHS = Builder->getInt(Comp); if (Shr->getOpcode() == Instruction::LShr) Comp = Comp.lshr(ShAmtVal); else @@ -1025,8 +1027,7 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr, if (Comp != CmpRHSV) { // Comparing against a bit that we know is zero. 
bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE; - Constant *Cst = ConstantInt::get(Type::getInt1Ty(ICI.getContext()), - IsICMP_NE); + Constant *Cst = Builder->getInt1(IsICMP_NE); return ReplaceInstUsesWith(ICI, Cst); } @@ -1039,7 +1040,7 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr, if (Shr->hasOneUse()) { // Otherwise strength reduce the shift into an and. APInt Val(APInt::getHighBitsSet(TypeBits, TypeBits - ShAmtVal)); - Constant *Mask = ConstantInt::get(ICI.getContext(), Val); + Constant *Mask = Builder->getInt(Val); Value *And = Builder->CreateAnd(Shr->getOperand(0), Mask, Shr->getName()+".mask"); @@ -1072,7 +1073,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, APInt NewRHS = RHS->getValue().zext(SrcBits); NewRHS |= KnownOne & APInt::getHighBitsSet(SrcBits, SrcBits-DstBits); return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0), - ConstantInt::get(ICI.getContext(), NewRHS)); + Builder->getInt(NewRHS)); } } break; @@ -1115,8 +1116,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, ? ICI.getUnsignedPredicate() : ICI.getSignedPredicate(); return new ICmpInst(Pred, LHSI->getOperand(0), - ConstantInt::get(ICI.getContext(), - RHSV ^ SignBit)); + Builder->getInt(RHSV ^ SignBit)); } // (icmp u/s (xor A ~SignBit), C) -> (icmp s/u (xor C ~SignBit), A) @@ -1127,10 +1127,21 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, : ICI.getSignedPredicate(); Pred = ICI.getSwappedPredicate(Pred); return new ICmpInst(Pred, LHSI->getOperand(0), - ConstantInt::get(ICI.getContext(), - RHSV ^ NotSignBit)); + Builder->getInt(RHSV ^ NotSignBit)); } } + + // (icmp ugt (xor X, C), ~C) -> (icmp ult X, C) + // iff -C is a power of 2 + if (ICI.getPredicate() == ICmpInst::ICMP_UGT && + XorCST->getValue() == ~RHSV && (RHSV + 1).isPowerOf2()) + return new ICmpInst(ICmpInst::ICMP_ULT, LHSI->getOperand(0), XorCST); + + // (icmp ult (xor X, C), -C) -> (icmp uge X, C) + // iff -C is a power of 2 + if (ICI.getPredicate() == ICmpInst::ICMP_ULT && + XorCST->getValue() == -RHSV && RHSV.isPowerOf2()) + return new ICmpInst(ICmpInst::ICMP_UGE, LHSI->getOperand(0), XorCST); } break; case Instruction::And: // (icmp pred (and X, AndCST), RHS) @@ -1187,11 +1198,16 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, Type *AndTy = AndCST->getType(); // Type of the and. // We can fold this as long as we can't shift unknown bits - // into the mask. This can only happen with signed shift - // rights, as they sign-extend. + // into the mask. This can happen with signed shift + // rights, as they sign-extend. With logical shifts, + // we must still make sure the comparison is not signed + // because we are effectively changing the + // position of the sign bit (PR17827). + // TODO: We can relax these constraints a bit more. if (ShAmt) { - bool CanFold = Shift->isLogicalShift(); - if (!CanFold) { + bool CanFold = false; + unsigned ShiftOpcode = Shift->getOpcode(); + if (ShiftOpcode == Instruction::AShr) { // To test for the bad case of the signed shr, see if any // of the bits shifted in could be tested after the mask. 
uint32_t TyBits = Ty->getPrimitiveSizeInBits(); @@ -1201,6 +1217,9 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, if ((APInt::getHighBitsSet(BitWidth, BitWidth-ShAmtVal) & AndCST->getValue()) == 0) CanFold = true; + } else if (ShiftOpcode == Instruction::Shl || + ShiftOpcode == Instruction::LShr) { + CanFold = !ICI.isSigned(); } if (CanFold) { @@ -1218,11 +1237,9 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, // As a special case, check to see if this means that the // result is always true or false now. if (ICI.getPredicate() == ICmpInst::ICMP_EQ) - return ReplaceInstUsesWith(ICI, - ConstantInt::getFalse(ICI.getContext())); + return ReplaceInstUsesWith(ICI, Builder->getFalse()); if (ICI.getPredicate() == ICmpInst::ICMP_NE) - return ReplaceInstUsesWith(ICI, - ConstantInt::getTrue(ICI.getContext())); + return ReplaceInstUsesWith(ICI, Builder->getTrue()); } else { ICI.setOperand(1, NewCst); Constant *NewAndCST; @@ -1284,6 +1301,15 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, return Res; } } + + // X & -C == -C -> X > u ~C + // X & -C != -C -> X <= u ~C + // iff C is a power of 2 + if (ICI.isEquality() && RHS == LHSI->getOperand(1) && (-RHSV).isPowerOf2()) + return new ICmpInst( + ICI.getPredicate() == ICmpInst::ICMP_EQ ? ICmpInst::ICMP_UGT + : ICmpInst::ICMP_ULE, + LHSI->getOperand(0), SubOne(RHS)); break; case Instruction::Or: { @@ -1325,10 +1351,80 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, } case Instruction::Shl: { // (icmp pred (shl X, ShAmt), CI) + uint32_t TypeBits = RHSV.getBitWidth(); ConstantInt *ShAmt = dyn_cast<ConstantInt>(LHSI->getOperand(1)); - if (!ShAmt) break; + if (!ShAmt) { + Value *X; + // (1 << X) pred P2 -> X pred Log2(P2) + if (match(LHSI, m_Shl(m_One(), m_Value(X)))) { + bool RHSVIsPowerOf2 = RHSV.isPowerOf2(); + ICmpInst::Predicate Pred = ICI.getPredicate(); + if (ICI.isUnsigned()) { + if (!RHSVIsPowerOf2) { + // (1 << X) < 30 -> X <= 4 + // (1 << X) <= 30 -> X <= 4 + // (1 << X) >= 30 -> X > 4 + // (1 << X) > 30 -> X > 4 + if (Pred == ICmpInst::ICMP_ULT) + Pred = ICmpInst::ICMP_ULE; + else if (Pred == ICmpInst::ICMP_UGE) + Pred = ICmpInst::ICMP_UGT; + } + unsigned RHSLog2 = RHSV.logBase2(); + + // (1 << X) >= 2147483648 -> X >= 31 -> X == 31 + // (1 << X) > 2147483648 -> X > 31 -> false + // (1 << X) <= 2147483648 -> X <= 31 -> true + // (1 << X) < 2147483648 -> X < 31 -> X != 31 + if (RHSLog2 == TypeBits-1) { + if (Pred == ICmpInst::ICMP_UGE) + Pred = ICmpInst::ICMP_EQ; + else if (Pred == ICmpInst::ICMP_UGT) + return ReplaceInstUsesWith(ICI, Builder->getFalse()); + else if (Pred == ICmpInst::ICMP_ULE) + return ReplaceInstUsesWith(ICI, Builder->getTrue()); + else if (Pred == ICmpInst::ICMP_ULT) + Pred = ICmpInst::ICMP_NE; + } - uint32_t TypeBits = RHSV.getBitWidth(); + return new ICmpInst(Pred, X, + ConstantInt::get(RHS->getType(), RHSLog2)); + } else if (ICI.isSigned()) { + if (RHSV.isAllOnesValue()) { + // (1 << X) <= -1 -> X == 31 + if (Pred == ICmpInst::ICMP_SLE) + return new ICmpInst(ICmpInst::ICMP_EQ, X, + ConstantInt::get(RHS->getType(), TypeBits-1)); + + // (1 << X) > -1 -> X != 31 + if (Pred == ICmpInst::ICMP_SGT) + return new ICmpInst(ICmpInst::ICMP_NE, X, + ConstantInt::get(RHS->getType(), TypeBits-1)); + } else if (!RHSV) { + // (1 << X) < 0 -> X == 31 + // (1 << X) <= 0 -> X == 31 + if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE) + return new ICmpInst(ICmpInst::ICMP_EQ, X, + ConstantInt::get(RHS->getType(), 
TypeBits-1)); + + // (1 << X) >= 0 -> X != 31 + // (1 << X) > 0 -> X != 31 + if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE) + return new ICmpInst(ICmpInst::ICMP_NE, X, + ConstantInt::get(RHS->getType(), TypeBits-1)); + } + } else if (ICI.isEquality()) { + if (RHSVIsPowerOf2) + return new ICmpInst( + Pred, X, ConstantInt::get(RHS->getType(), RHSV.logBase2())); + + return ReplaceInstUsesWith( + ICI, Pred == ICmpInst::ICMP_EQ ? Builder->getFalse() + : Builder->getTrue()); + } + } + break; + } // Check that the shift amount is in range. If not, don't perform // undefined shifts. When the shift is visited it will be @@ -1344,8 +1440,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, ShAmt); if (Comp != RHS) {// Comparing against a bit that we know is zero. bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE; - Constant *Cst = - ConstantInt::get(Type::getInt1Ty(ICI.getContext()), IsICMP_NE); + Constant *Cst = Builder->getInt1(IsICMP_NE); return ReplaceInstUsesWith(ICI, Cst); } @@ -1364,9 +1459,8 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, if (LHSI->hasOneUse()) { // Otherwise strength reduce the shift into an and. uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits); - Constant *Mask = - ConstantInt::get(ICI.getContext(), APInt::getLowBitsSet(TypeBits, - TypeBits-ShAmtVal)); + Constant *Mask = Builder->getInt(APInt::getLowBitsSet(TypeBits, + TypeBits - ShAmtVal)); Value *And = Builder->CreateAnd(LHSI->getOperand(0),Mask, LHSI->getName()+".mask"); @@ -1451,6 +1545,30 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, return R; break; + case Instruction::Sub: { + ConstantInt *LHSC = dyn_cast<ConstantInt>(LHSI->getOperand(0)); + if (!LHSC) break; + const APInt &LHSV = LHSC->getValue(); + + // C1-X <u C2 -> (X|(C2-1)) == C1 + // iff C1 & (C2-1) == C2-1 + // C2 is a power of 2 + if (ICI.getPredicate() == ICmpInst::ICMP_ULT && LHSI->hasOneUse() && + RHSV.isPowerOf2() && (LHSV & (RHSV - 1)) == (RHSV - 1)) + return new ICmpInst(ICmpInst::ICMP_EQ, + Builder->CreateOr(LHSI->getOperand(1), RHSV - 1), + LHSC); + + // C1-X >u C2 -> (X|C2) != C1 + // iff C1 & C2 == C2 + // C2+1 is a power of 2 + if (ICI.getPredicate() == ICmpInst::ICMP_UGT && LHSI->hasOneUse() && + (RHSV + 1).isPowerOf2() && (LHSV & RHSV) == RHSV) + return new ICmpInst(ICmpInst::ICMP_NE, + Builder->CreateOr(LHSI->getOperand(1), RHSV), LHSC); + break; + } + case Instruction::Add: // Fold: icmp pred (add X, C1), C2 if (!ICI.isEquality()) { @@ -1464,20 +1582,38 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, if (ICI.isSigned()) { if (CR.getLower().isSignBit()) { return new ICmpInst(ICmpInst::ICMP_SLT, LHSI->getOperand(0), - ConstantInt::get(ICI.getContext(),CR.getUpper())); + Builder->getInt(CR.getUpper())); } else if (CR.getUpper().isSignBit()) { return new ICmpInst(ICmpInst::ICMP_SGE, LHSI->getOperand(0), - ConstantInt::get(ICI.getContext(),CR.getLower())); + Builder->getInt(CR.getLower())); } } else { if (CR.getLower().isMinValue()) { return new ICmpInst(ICmpInst::ICMP_ULT, LHSI->getOperand(0), - ConstantInt::get(ICI.getContext(),CR.getUpper())); + Builder->getInt(CR.getUpper())); } else if (CR.getUpper().isMinValue()) { return new ICmpInst(ICmpInst::ICMP_UGE, LHSI->getOperand(0), - ConstantInt::get(ICI.getContext(),CR.getLower())); + Builder->getInt(CR.getLower())); } } + + // X-C1 <u C2 -> (X & -C2) == C1 + // iff C1 & (C2-1) == 0 + // C2 is a power of 2 + if (ICI.getPredicate() == ICmpInst::ICMP_ULT 
&& LHSI->hasOneUse() && + RHSV.isPowerOf2() && (LHSV & (RHSV - 1)) == 0) + return new ICmpInst(ICmpInst::ICMP_EQ, + Builder->CreateAnd(LHSI->getOperand(0), -RHSV), + ConstantExpr::getNeg(LHSC)); + + // X-C1 >u C2 -> (X & ~C2) != C1 + // iff C1 & C2 == 0 + // C2+1 is a power of 2 + if (ICI.getPredicate() == ICmpInst::ICMP_UGT && LHSI->hasOneUse() && + (RHSV + 1).isPowerOf2() && (LHSV & RHSV) == 0) + return new ICmpInst(ICmpInst::ICMP_NE, + Builder->CreateAnd(LHSI->getOperand(0), ~RHSV), + ConstantExpr::getNeg(LHSC)); } break; } @@ -1555,9 +1691,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, if (ConstantInt *BOC = dyn_cast<ConstantInt>(BO->getOperand(1))) { Constant *NotCI = ConstantExpr::getNot(RHS); if (!ConstantExpr::getAnd(BOC, NotCI)->isNullValue()) - return ReplaceInstUsesWith(ICI, - ConstantInt::get(Type::getInt1Ty(ICI.getContext()), - isICMP_NE)); + return ReplaceInstUsesWith(ICI, Builder->getInt1(isICMP_NE)); } break; @@ -1566,9 +1700,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, // If bits are being compared against that are and'd out, then the // comparison can never succeed! if ((RHSV & ~BOC->getValue()) != 0) - return ReplaceInstUsesWith(ICI, - ConstantInt::get(Type::getInt1Ty(ICI.getContext()), - isICMP_NE)); + return ReplaceInstUsesWith(ICI, Builder->getInt1(isICMP_NE)); // If we have ((X & C) == C), turn it into ((X & C) != 0). if (RHS == BOC && RHSV.isPowerOf2()) @@ -1619,7 +1751,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, case Intrinsic::bswap: Worklist.Add(II); ICI.setOperand(0, II->getArgOperand(0)); - ICI.setOperand(1, ConstantInt::get(II->getContext(), RHSV.byteSwap())); + ICI.setOperand(1, Builder->getInt(RHSV.byteSwap())); return &ICI; case Intrinsic::ctlz: case Intrinsic::cttz: @@ -1661,8 +1793,7 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) { // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the // integer type is the same size as the pointer type. if (TD && LHSCI->getOpcode() == Instruction::PtrToInt && - TD->getPointerSizeInBits() == - cast<IntegerType>(DestTy)->getBitWidth()) { + TD->getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth()) { Value *RHSOp = 0; if (Constant *RHSC = dyn_cast<Constant>(ICI.getOperand(1))) { RHSOp = ConstantExpr::getIntToPtr(RHSC, SrcTy); @@ -1915,14 +2046,59 @@ static APInt DemandedBitsLHSMask(ICmpInst &I, } +/// \brief Check if the order of \p Op0 and \p Op1 as operands in an ICmpInst +/// should be swapped. +/// The decision is based on how many times these two operands are reused +/// as subtract operands and their positions in those instructions. +/// The rationale is that several architectures use the same instruction for +/// both subtract and cmp, thus it is better if the order of those operands +/// matches. +/// \return true if Op0 and Op1 should be swapped. +static bool swapMayExposeCSEOpportunities(const Value * Op0, + const Value * Op1) { + // Filter out pointer values, as those cannot appear directly in a subtract. + // FIXME: we may want to go through inttoptrs or bitcasts. + if (Op0->getType()->isPointerTy()) + return false; + // Count every use of both Op0 and Op1 in a subtract. + // Each time Op0 is the first operand, count -1: swapping is bad, the + // subtract already has the same layout as the compare. + // Each time Op0 is the second operand, count +1: swapping is good, the + // subtract has a different layout than the compare.
+ // At the end, if the benefit is greater than 0, Op0 should come second to + // expose more CSE opportunities. + int GlobalSwapBenefits = 0; + for (Value::const_use_iterator UI = Op0->use_begin(), UIEnd = Op0->use_end(); UI != UIEnd; ++UI) { + const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(*UI); + if (!BinOp || BinOp->getOpcode() != Instruction::Sub) + continue; + // If Op0 is the first argument, this is not beneficial to swap the + // arguments. + int LocalSwapBenefits = -1; + unsigned Op1Idx = 1; + if (BinOp->getOperand(Op1Idx) == Op0) { + Op1Idx = 0; + LocalSwapBenefits = 1; + } + if (BinOp->getOperand(Op1Idx) != Op1) + continue; + GlobalSwapBenefits += LocalSwapBenefits; + } + return GlobalSwapBenefits > 0; +} + Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { bool Changed = false; Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + unsigned Op0Cplxity = getComplexity(Op0); + unsigned Op1Cplxity = getComplexity(Op1); /// Orders the operands of the compare so that they are listed from most /// complex to least complex. This puts constants before unary operators, /// before binary operators. - if (getComplexity(Op0) < getComplexity(Op1)) { + if (Op0Cplxity < Op1Cplxity || + (Op0Cplxity == Op1Cplxity && + swapMayExposeCSEOpportunities(Op0, Op1))) { I.swapOperands(); std::swap(Op0, Op1); Changed = true; @@ -2041,19 +2217,19 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { case ICmpInst::ICMP_ULE: assert(!CI->isMaxValue(false)); // A <=u MAX -> TRUE return new ICmpInst(ICmpInst::ICMP_ULT, Op0, - ConstantInt::get(CI->getContext(), CI->getValue()+1)); + Builder->getInt(CI->getValue()+1)); case ICmpInst::ICMP_SLE: assert(!CI->isMaxValue(true)); // A <=s MAX -> TRUE return new ICmpInst(ICmpInst::ICMP_SLT, Op0, - ConstantInt::get(CI->getContext(), CI->getValue()+1)); + Builder->getInt(CI->getValue()+1)); case ICmpInst::ICMP_UGE: assert(!CI->isMinValue(false)); // A >=u MIN -> TRUE return new ICmpInst(ICmpInst::ICMP_UGT, Op0, - ConstantInt::get(CI->getContext(), CI->getValue()-1)); + Builder->getInt(CI->getValue()-1)); case ICmpInst::ICMP_SGE: assert(!CI->isMinValue(true)); // A >=s MIN -> TRUE return new ICmpInst(ICmpInst::ICMP_SGT, Op0, - ConstantInt::get(CI->getContext(), CI->getValue()-1)); + Builder->getInt(CI->getValue()-1)); } // If this comparison is a normal comparison, it demands all @@ -2192,7 +2368,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { if (Op1Max == Op0Min+1) // A <u C -> A == C-1 if min(A)+1 == C return new ICmpInst(ICmpInst::ICMP_EQ, Op0, - ConstantInt::get(CI->getContext(), CI->getValue()-1)); + Builder->getInt(CI->getValue()-1)); // (x <u 2147483648) -> (x >s -1) -> true if sign bit clear if (CI->isMinValue(true)) @@ -2211,7 +2387,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { if (Op1Min == Op0Max-1) // A >u C -> A == C+1 if max(a)-1 == C return new ICmpInst(ICmpInst::ICMP_EQ, Op0, - ConstantInt::get(CI->getContext(), CI->getValue()+1)); + Builder->getInt(CI->getValue()+1)); // (x >u 2147483647) -> (x <s 0) -> true if sign bit set if (CI->isMaxValue(true)) @@ -2229,7 +2405,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { if (Op1Max == Op0Min+1) // A <s C -> A == C-1 if min(A)+1 == C return new ICmpInst(ICmpInst::ICMP_EQ, Op0, - ConstantInt::get(CI->getContext(), CI->getValue()-1)); + Builder->getInt(CI->getValue()-1)); } break; case 
ICmpInst::ICMP_SGT: @@ -2243,7 +2419,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { if (Op1Min == Op0Max-1) // A >s C -> A == C+1 if max(A)-1 == C return new ICmpInst(ICmpInst::ICMP_EQ, Op0, - ConstantInt::get(CI->getContext(), CI->getValue()+1)); + Builder->getInt(CI->getValue()+1)); } break; case ICmpInst::ICMP_SGE: @@ -2357,7 +2533,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { case Instruction::IntToPtr: // icmp pred inttoptr(X), null -> icmp pred X, 0 if (RHSC->isNullValue() && TD && - TD->getIntPtrType(RHSC->getContext()) == + TD->getIntPtrType(RHSC->getType()) == LHSI->getOperand(0)->getType()) return new ICmpInst(I.getPredicate(), LHSI->getOperand(0), Constant::getNullValue(LHSI->getOperand(0)->getType())); @@ -2719,8 +2895,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { ConstantInt *C1, *C2; if (match(B, m_ConstantInt(C1)) && match(D, m_ConstantInt(C2)) && Op1->hasOneUse()) { - Constant *NC = ConstantInt::get(I.getContext(), - C1->getValue() ^ C2->getValue()); + Constant *NC = Builder->getInt(C1->getValue() ^ C2->getValue()); Value *Xor = Builder->CreateXor(C, NC); return new ICmpInst(I.getPredicate(), A, Xor); } @@ -2781,6 +2956,24 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { Builder->CreateTrunc(B, A->getType())); } + // (A >> C) == (B >> C) --> (A^B) u< (1 << C) + // For lshr and ashr pairs. + if ((match(Op0, m_OneUse(m_LShr(m_Value(A), m_ConstantInt(Cst1)))) && + match(Op1, m_OneUse(m_LShr(m_Value(B), m_Specific(Cst1))))) || + (match(Op0, m_OneUse(m_AShr(m_Value(A), m_ConstantInt(Cst1)))) && + match(Op1, m_OneUse(m_AShr(m_Value(B), m_Specific(Cst1)))))) { + unsigned TypeBits = Cst1->getBitWidth(); + unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits); + if (ShAmt < TypeBits && ShAmt != 0) { + ICmpInst::Predicate Pred = I.getPredicate() == ICmpInst::ICMP_NE + ? ICmpInst::ICMP_UGE + : ICmpInst::ICMP_ULT; + Value *Xor = Builder->CreateXor(A, B, I.getName() + ".unshifted"); + APInt CmpVal = APInt::getOneBitSet(TypeBits, ShAmt); + return new ICmpInst(Pred, Xor, Builder->getInt(CmpVal)); + } + } + // Transform "icmp eq (trunc (lshr(X, cst1)), cst" to // "icmp (and X, mask), cst" uint64_t ShAmt = 0; @@ -2811,20 +3004,15 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { Value *X; ConstantInt *Cst; // icmp X+Cst, X if (match(Op0, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op1 == X) - return FoldICmpAddOpCst(I, X, Cst, I.getPredicate(), Op0); + return FoldICmpAddOpCst(I, X, Cst, I.getPredicate()); // icmp X, X+Cst if (match(Op1, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op0 == X) - return FoldICmpAddOpCst(I, X, Cst, I.getSwappedPredicate(), Op1); + return FoldICmpAddOpCst(I, X, Cst, I.getSwappedPredicate()); } return Changed ? &I : 0; } - - - - - /// FoldFCmp_IntToFP_Cst - Fold fcmp ([us]itofp x, cst) if possible. 
/// Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, @@ -2885,9 +3073,9 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, Pred = ICmpInst::ICMP_NE; break; case FCmpInst::FCMP_ORD: - return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getTrue()); case FCmpInst::FCMP_UNO: - return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getFalse()); } IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType()); @@ -2901,50 +3089,50 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, if (!LHSUnsigned) { // If the RHS value is > SignedMax, fold the comparison. This handles +INF // and large values. - APFloat SMax(RHS.getSemantics(), APFloat::fcZero, false); + APFloat SMax(RHS.getSemantics()); SMax.convertFromAPInt(APInt::getSignedMaxValue(IntWidth), true, APFloat::rmNearestTiesToEven); if (SMax.compare(RHS) == APFloat::cmpLessThan) { // smax < 13123.0 if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE) - return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); - return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getTrue()); + return ReplaceInstUsesWith(I, Builder->getFalse()); } } else { // If the RHS value is > UnsignedMax, fold the comparison. This handles // +INF and large values. - APFloat UMax(RHS.getSemantics(), APFloat::fcZero, false); + APFloat UMax(RHS.getSemantics()); UMax.convertFromAPInt(APInt::getMaxValue(IntWidth), false, APFloat::rmNearestTiesToEven); if (UMax.compare(RHS) == APFloat::cmpLessThan) { // umax < 13123.0 if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE) - return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); - return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getTrue()); + return ReplaceInstUsesWith(I, Builder->getFalse()); } } if (!LHSUnsigned) { // See if the RHS value is < SignedMin. - APFloat SMin(RHS.getSemantics(), APFloat::fcZero, false); + APFloat SMin(RHS.getSemantics()); SMin.convertFromAPInt(APInt::getSignedMinValue(IntWidth), true, APFloat::rmNearestTiesToEven); if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // smin > 12312.0 if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE) - return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); - return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getTrue()); + return ReplaceInstUsesWith(I, Builder->getFalse()); } } else { // See if the RHS value is < UnsignedMin. 
- APFloat SMin(RHS.getSemantics(), APFloat::fcZero, false); + APFloat SMin(RHS.getSemantics()); SMin.convertFromAPInt(APInt::getMinValue(IntWidth), true, APFloat::rmNearestTiesToEven); if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // umin > 12312.0 if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE) - return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); - return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getTrue()); + return ReplaceInstUsesWith(I, Builder->getFalse()); } } @@ -2966,14 +3154,14 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, switch (Pred) { default: llvm_unreachable("Unexpected integer comparison!"); case ICmpInst::ICMP_NE: // (float)int != 4.4 --> true - return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getTrue()); case ICmpInst::ICMP_EQ: // (float)int == 4.4 --> false - return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getFalse()); case ICmpInst::ICMP_ULE: // (float)int <= 4.4 --> int <= 4 // (float)int <= -4.4 --> false if (RHS.isNegative()) - return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getFalse()); break; case ICmpInst::ICMP_SLE: // (float)int <= 4.4 --> int <= 4 @@ -2985,7 +3173,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, // (float)int < -4.4 --> false // (float)int < 4.4 --> int <= 4 if (RHS.isNegative()) - return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getFalse()); Pred = ICmpInst::ICMP_ULE; break; case ICmpInst::ICMP_SLT: @@ -2998,7 +3186,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, // (float)int > 4.4 --> int > 4 // (float)int > -4.4 --> true if (RHS.isNegative()) - return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getTrue()); break; case ICmpInst::ICMP_SGT: // (float)int > 4.4 --> int > 4 @@ -3010,7 +3198,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, // (float)int >= -4.4 --> true // (float)int >= 4.4 --> int > 4 if (RHS.isNegative()) - return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getTrue()); Pred = ICmpInst::ICMP_UGT; break; case ICmpInst::ICMP_SGE: diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index e2d7966..4c861b3 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -154,7 +154,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // Ensure that the alloca array size argument has type intptr_t, so that // any casting is exposed early. if (TD) { - Type *IntPtrTy = TD->getIntPtrType(AI.getContext()); + Type *IntPtrTy = TD->getIntPtrType(AI.getType()); if (AI.getArraySize()->getType() != IntPtrTy) { Value *V = Builder->CreateIntCast(AI.getArraySize(), IntPtrTy, false); @@ -180,12 +180,13 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // Now that I is pointing to the first non-allocation-inst in the block, // insert our getelementptr instruction... 
// - Value *NullIdx =Constant::getNullValue(Type::getInt32Ty(AI.getContext())); - Value *Idx[2]; - Idx[0] = NullIdx; - Idx[1] = NullIdx; + Type *IdxTy = TD + ? TD->getIntPtrType(AI.getType()) + : Type::getInt64Ty(AI.getContext()); + Value *NullIdx = Constant::getNullValue(IdxTy); + Value *Idx[2] = { NullIdx, NullIdx }; Instruction *GEP = - GetElementPtrInst::CreateInBounds(New, Idx, New->getName()+".sub"); + GetElementPtrInst::CreateInBounds(New, Idx, New->getName() + ".sub"); InsertNewInstBefore(GEP, *It); // Now make everything use the getelementptr instead of the original @@ -262,9 +263,9 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { for (unsigned i = 0, e = ToDelete.size(); i != e; ++i) EraseInstFromFunction(*ToDelete[i]); Constant *TheSrc = cast<Constant>(Copy->getSource()); - Instruction *NewI - = ReplaceInstUsesWith(AI, ConstantExpr::getBitCast(TheSrc, - AI.getType())); + Constant *Cast + = ConstantExpr::getPointerBitCastOrAddrSpaceCast(TheSrc, AI.getType()); + Instruction *NewI = ReplaceInstUsesWith(AI, Cast); EraseInstFromFunction(*Copy); ++NumGlobalCopies; return NewI; @@ -302,9 +303,11 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI, if (ArrayType *ASrcTy = dyn_cast<ArrayType>(SrcPTy)) if (Constant *CSrc = dyn_cast<Constant>(CastOp)) if (ASrcTy->getNumElements() != 0) { - Value *Idxs[2]; - Idxs[0] = Constant::getNullValue(Type::getInt32Ty(LI.getContext())); - Idxs[1] = Idxs[0]; + Type *IdxTy = TD + ? TD->getIntPtrType(SrcTy) + : Type::getInt64Ty(SrcTy->getContext()); + Value *Idx = Constant::getNullValue(IdxTy); + Value *Idxs[2] = { Idx, Idx }; CastOp = ConstantExpr::getGetElementPtr(CSrc, Idxs); SrcTy = cast<PointerType>(CastOp->getType()); SrcPTy = SrcTy->getElementType(); @@ -315,7 +318,8 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI, SrcPTy->isVectorTy()) && // Do not allow turning this into a load of an integer, which is then // casted to a pointer, this pessimizes pointer analysis a lot. - (SrcPTy->isPointerTy() == LI.getType()->isPointerTy()) && + (SrcPTy->isPtrOrPtrVectorTy() == + LI.getType()->isPtrOrPtrVectorTy()) && IC.getDataLayout()->getTypeSizeInBits(SrcPTy) == IC.getDataLayout()->getTypeSizeInBits(DestPTy)) { diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index ecc9fc3..a759548 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -95,6 +95,25 @@ static bool MultiplyOverflows(ConstantInt *C1, ConstantInt *C2, bool sign) { return MulExt.slt(Min) || MulExt.sgt(Max); } +/// \brief A helper routine of InstCombiner::visitMul(). +/// +/// If C is a vector of known powers of 2, then this function returns +/// a new vector obtained from C replacing each element with its logBase2. +/// Return a null pointer otherwise. 
+static Constant *getLogBase2Vector(ConstantDataVector *CV) { + const APInt *IVal; + SmallVector<Constant *, 4> Elts; + + for (unsigned I = 0, E = CV->getNumElements(); I != E; ++I) { + Constant *Elt = CV->getElementAsConstant(I); + if (!match(Elt, m_APInt(IVal)) || !IVal->isPowerOf2()) + return 0; + Elts.push_back(ConstantInt::get(Elt->getType(), IVal->logBase2())); + } + + return ConstantVector::get(Elts); +} + Instruction *InstCombiner::visitMul(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); @@ -108,24 +127,37 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { if (match(Op1, m_AllOnes())) // X * -1 == 0 - X return BinaryOperator::CreateNeg(Op0, I.getName()); - if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { - - // ((X << C1)*C2) == (X * (C2 << C1)) - if (BinaryOperator *SI = dyn_cast<BinaryOperator>(Op0)) - if (SI->getOpcode() == Instruction::Shl) - if (Constant *ShOp = dyn_cast<Constant>(SI->getOperand(1))) - return BinaryOperator::CreateMul(SI->getOperand(0), - ConstantExpr::getShl(CI, ShOp)); - - const APInt &Val = CI->getValue(); - if (Val.isPowerOf2()) { // Replace X*(2^C) with X << C - Constant *NewCst = ConstantInt::get(Op0->getType(), Val.logBase2()); - BinaryOperator *Shl = BinaryOperator::CreateShl(Op0, NewCst); - if (I.hasNoSignedWrap()) Shl->setHasNoSignedWrap(); - if (I.hasNoUnsignedWrap()) Shl->setHasNoUnsignedWrap(); - return Shl; + // Also allow combining multiply instructions on vectors. + { + Value *NewOp; + Constant *C1, *C2; + const APInt *IVal; + if (match(&I, m_Mul(m_Shl(m_Value(NewOp), m_Constant(C2)), + m_Constant(C1))) && + match(C1, m_APInt(IVal))) + // ((X << C1)*C2) == (X * (C2 << C1)) + return BinaryOperator::CreateMul(NewOp, ConstantExpr::getShl(C1, C2)); + + if (match(&I, m_Mul(m_Value(NewOp), m_Constant(C1)))) { + Constant *NewCst = 0; + if (match(C1, m_APInt(IVal)) && IVal->isPowerOf2()) + // Replace X*(2^C) with X << C, where C is either a scalar or a splat. + NewCst = ConstantInt::get(NewOp->getType(), IVal->logBase2()); + else if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(C1)) + // Replace X*(2^C) with X << C, where C is a vector of known + // constant powers of 2. + NewCst = getLogBase2Vector(CV); + + if (NewCst) { + BinaryOperator *Shl = BinaryOperator::CreateShl(NewOp, NewCst); + if (I.hasNoSignedWrap()) Shl->setHasNoSignedWrap(); + if (I.hasNoUnsignedWrap()) Shl->setHasNoUnsignedWrap(); + return Shl; + } } + } + if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { // Canonicalize (X+C1)*CI -> X*CI+C1*CI. { Value *X; ConstantInt *C1; if (Op0->hasOneUse() && @@ -306,13 +338,13 @@ static bool isFMulOrFDivWithConstant(Value *V) { if (C0 && C1) return false; - return (C0 && C0->getValueAPF().isNormal()) || - (C1 && C1->getValueAPF().isNormal()); + return (C0 && C0->getValueAPF().isFiniteNonZero()) || + (C1 && C1->getValueAPF().isFiniteNonZero()); } static bool isNormalFp(const ConstantFP *C) { const APFloat &Flt = C->getValueAPF(); - return Flt.isNormal() && !Flt.isDenormal(); + return Flt.isNormal(); } /// foldFMulConst() is a helper routine of InstCombiner::visitFMul(). @@ -342,9 +374,12 @@ Value *InstCombiner::foldFMulConst(Instruction *FMulOrDiv, ConstantFP *C, } else { if (C0) { // (C0 / X) * C => (C0 * C) / X - ConstantFP *F = cast<ConstantFP>(ConstantExpr::getFMul(C0, C)); - if (isNormalFp(F)) - R = BinaryOperator::CreateFDiv(F, Opnd1); + if (FMulOrDiv->hasOneUse()) { + // It would otherwise introduce another div. 
+ ConstantFP *F = cast<ConstantFP>(ConstantExpr::getFMul(C0, C)); + if (isNormalFp(F)) + R = BinaryOperator::CreateFDiv(F, Opnd1); + } } else { // (X / C1) * C => X * (C/C1) if C/C1 is not a denormal ConstantFP *F = cast<ConstantFP>(ConstantExpr::getFDiv(C, C1)); @@ -391,7 +426,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { return NV; ConstantFP *C = dyn_cast<ConstantFP>(Op1); - if (C && AllowReassociate && C->getValueAPF().isNormal()) { + if (C && AllowReassociate && C->getValueAPF().isFiniteNonZero()) { // Let MDC denote an expression in one of these forms: // X * C, C/X, X/C, where C is a constant. // @@ -418,7 +453,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { Swap = true; } - if (C1 && C1->getValueAPF().isNormal() && + if (C1 && C1->getValueAPF().isFiniteNonZero() && isFMulOrFDivWithConstant(Opnd0)) { Value *M1 = ConstantExpr::getFMul(C1, C); Value *M0 = isNormalFp(cast<ConstantFP>(M1)) ? @@ -428,10 +463,9 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { if (Swap && FAddSub->getOpcode() == Instruction::FSub) std::swap(M0, M1); - Value *R = (FAddSub->getOpcode() == Instruction::FAdd) ? - BinaryOperator::CreateFAdd(M0, M1) : - BinaryOperator::CreateFSub(M0, M1); - Instruction *RI = cast<Instruction>(R); + Instruction *RI = (FAddSub->getOpcode() == Instruction::FAdd) + ? BinaryOperator::CreateFAdd(M0, M1) + : BinaryOperator::CreateFSub(M0, M1); RI->copyFastMathFlags(&I); return RI; } @@ -458,13 +492,13 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { } // if pattern detected emit alternate sequence if (OpX && OpY) { + BuilderTy::FastMathFlagGuard Guard(*Builder); + Builder->SetFastMathFlags(Log2->getFastMathFlags()); Log2->setArgOperand(0, OpY); Value *FMulVal = Builder->CreateFMul(OpX, Log2); - Instruction *FMul = cast<Instruction>(FMulVal); - FMul->copyFastMathFlags(Log2); - Instruction *FSub = BinaryOperator::CreateFSub(FMulVal, OpX); - FSub->copyFastMathFlags(Log2); - return FSub; + Value *FSub = Builder->CreateFSub(FMulVal, OpX); + FSub->takeName(&I); + return ReplaceInstUsesWith(I, FSub); } } @@ -474,6 +508,9 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { for (int i = 0; i < 2; i++) { bool IgnoreZeroSign = I.hasNoSignedZeros(); if (BinaryOperator::isFNeg(Opnd0, IgnoreZeroSign)) { + BuilderTy::FastMathFlagGuard Guard(*Builder); + Builder->SetFastMathFlags(I.getFastMathFlags()); + Value *N0 = dyn_castFNegVal(Opnd0, IgnoreZeroSign); Value *N1 = dyn_castFNegVal(Opnd1, IgnoreZeroSign); @@ -484,13 +521,9 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { if (Opnd0->hasOneUse()) { // -X * Y => -(X*Y) (Promote negation as high as possible) Value *T = Builder->CreateFMul(N0, Opnd1); - cast<Instruction>(T)->setDebugLoc(I.getDebugLoc()); - Instruction *Neg = BinaryOperator::CreateFNeg(T); - if (I.getFastMathFlags().any()) { - cast<Instruction>(T)->copyFastMathFlags(&I); - Neg->copyFastMathFlags(&I); - } - return Neg; + Value *Neg = Builder->CreateFNeg(T); + Neg->takeName(&I); + return ReplaceInstUsesWith(I, Neg); } } @@ -513,13 +546,13 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { Y = Opnd0_0; if (Y) { - Instruction *T = cast<Instruction>(Builder->CreateFMul(Opnd1, Opnd1)); - T->copyFastMathFlags(&I); - T->setDebugLoc(I.getDebugLoc()); + BuilderTy::FastMathFlagGuard Guard(*Builder); + Builder->SetFastMathFlags(I.getFastMathFlags()); + Value *T = Builder->CreateFMul(Opnd1, Opnd1); - Instruction *R = BinaryOperator::CreateFMul(T, Y); - R->copyFastMathFlags(&I); - return R; + Value *R = 
Builder->CreateFMul(T, Y); + R->takeName(&I); + return ReplaceInstUsesWith(I, R); } } } @@ -528,10 +561,10 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { if (I.hasNoNaNs() && I.hasNoInfs() && I.hasNoSignedZeros()) { Value *LHS = Op0, *RHS = Op1; Value *B, *C; - if (!match(RHS, m_UIToFp(m_Value(C)))) + if (!match(RHS, m_UIToFP(m_Value(C)))) std::swap(LHS, RHS); - if (match(RHS, m_UIToFp(m_Value(C))) && C->getType()->isIntegerTy(1)) { + if (match(RHS, m_UIToFP(m_Value(C))) && C->getType()->isIntegerTy(1)) { B = LHS; Value *Zero = ConstantFP::getNegativeZero(B->getType()); return SelectInst::Create(C, B, Zero); @@ -542,10 +575,10 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { if (I.hasNoNaNs() && I.hasNoInfs() && I.hasNoSignedZeros()) { Value *LHS = Op0, *RHS = Op1; Value *A, *C; - if (!match(RHS, m_FSub(m_FPOne(), m_UIToFp(m_Value(C))))) + if (!match(RHS, m_FSub(m_FPOne(), m_UIToFP(m_Value(C))))) std::swap(LHS, RHS); - if (match(RHS, m_FSub(m_FPOne(), m_UIToFp(m_Value(C)))) && + if (match(RHS, m_FSub(m_FPOne(), m_UIToFP(m_Value(C)))) && C->getType()->isIntegerTy(1)) { A = LHS; Value *Zero = ConstantFP::getNegativeZero(A->getType()); @@ -613,8 +646,7 @@ bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) { *I = SI->getOperand(NonNullOperand); Worklist.Add(BBI); } else if (*I == SelectCond) { - *I = NonNullOperand == 1 ? ConstantInt::getTrue(BBI->getContext()) : - ConstantInt::getFalse(BBI->getContext()); + *I = Builder->getInt1(NonNullOperand == 1); Worklist.Add(BBI); } } @@ -703,40 +735,124 @@ static Value *dyn_castZExtVal(Value *V, Type *Ty) { return 0; } -Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { - Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); +namespace { +const unsigned MaxDepth = 6; +typedef Instruction *(*FoldUDivOperandCb)(Value *Op0, Value *Op1, + const BinaryOperator &I, + InstCombiner &IC); + +/// \brief Used to maintain state for visitUDivOperand(). +struct UDivFoldAction { + FoldUDivOperandCb FoldAction; ///< Informs visitUDiv() how to fold this + ///< operand. This can be zero if this action + ///< joins two actions together. + + Value *OperandToFold; ///< Which operand to fold. + union { + Instruction *FoldResult; ///< The instruction returned when FoldAction is + ///< invoked. + + size_t SelectLHSIdx; ///< Stores the LHS action index if this action + ///< joins two actions together. 
+ }; + + UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand) + : FoldAction(FA), OperandToFold(InputOperand), FoldResult(0) {} + UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand, size_t SLHS) + : FoldAction(FA), OperandToFold(InputOperand), SelectLHSIdx(SLHS) {} +}; +} - if (Value *V = SimplifyUDivInst(Op0, Op1, TD)) - return ReplaceInstUsesWith(I, V); +// X udiv 2^C -> X >> C +static Instruction *foldUDivPow2Cst(Value *Op0, Value *Op1, + const BinaryOperator &I, InstCombiner &IC) { + const APInt &C = cast<Constant>(Op1)->getUniqueInteger(); + BinaryOperator *LShr = BinaryOperator::CreateLShr( + Op0, ConstantInt::get(Op0->getType(), C.logBase2())); + if (I.isExact()) LShr->setIsExact(); + return LShr; +} - // Handle the integer div common cases - if (Instruction *Common = commonIDivTransforms(I)) - return Common; +// X udiv C, where C >= signbit +static Instruction *foldUDivNegCst(Value *Op0, Value *Op1, + const BinaryOperator &I, InstCombiner &IC) { + Value *ICI = IC.Builder->CreateICmpULT(Op0, cast<ConstantInt>(Op1)); - { - // X udiv 2^C -> X >> C - // Check to see if this is an unsigned division with an exact power of 2, - // if so, convert to a right shift. - const APInt *C; - if (match(Op1, m_Power2(C))) { - BinaryOperator *LShr = - BinaryOperator::CreateLShr(Op0, - ConstantInt::get(Op0->getType(), - C->logBase2())); - if (I.isExact()) LShr->setIsExact(); - return LShr; - } + return SelectInst::Create(ICI, Constant::getNullValue(I.getType()), + ConstantInt::get(I.getType(), 1)); +} + +// X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2) +static Instruction *foldUDivShl(Value *Op0, Value *Op1, const BinaryOperator &I, + InstCombiner &IC) { + Instruction *ShiftLeft = cast<Instruction>(Op1); + if (isa<ZExtInst>(ShiftLeft)) + ShiftLeft = cast<Instruction>(ShiftLeft->getOperand(0)); + + const APInt &CI = + cast<Constant>(ShiftLeft->getOperand(0))->getUniqueInteger(); + Value *N = ShiftLeft->getOperand(1); + if (CI != 1) + N = IC.Builder->CreateAdd(N, ConstantInt::get(N->getType(), CI.logBase2())); + if (ZExtInst *Z = dyn_cast<ZExtInst>(Op1)) + N = IC.Builder->CreateZExt(N, Z->getDestTy()); + BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, N); + if (I.isExact()) LShr->setIsExact(); + return LShr; +} + +// \brief Recursively visits the possible right-hand operands of a udiv +// instruction, seeing through select instructions, to determine if we can +// replace the udiv with something simpler. If any operand does not allow the +// udiv to be simplified, we abort the entire transformation. +static size_t visitUDivOperand(Value *Op0, Value *Op1, const BinaryOperator &I, + SmallVectorImpl<UDivFoldAction> &Actions, + unsigned Depth = 0) { + // Check to see if this is an unsigned division with an exact power of 2, + // if so, convert to a right shift. 
+ if (match(Op1, m_Power2())) { + Actions.push_back(UDivFoldAction(foldUDivPow2Cst, Op1)); + return Actions.size(); } - if (ConstantInt *C = dyn_cast<ConstantInt>(Op1)) { + if (ConstantInt *C = dyn_cast<ConstantInt>(Op1)) // X udiv C, where C >= signbit if (C->getValue().isNegative()) { - Value *IC = Builder->CreateICmpULT(Op0, C); - return SelectInst::Create(IC, Constant::getNullValue(I.getType()), - ConstantInt::get(I.getType(), 1)); + Actions.push_back(UDivFoldAction(foldUDivNegCst, C)); + return Actions.size(); } + + // X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2) + if (match(Op1, m_Shl(m_Power2(), m_Value())) || + match(Op1, m_ZExt(m_Shl(m_Power2(), m_Value())))) { + Actions.push_back(UDivFoldAction(foldUDivShl, Op1)); + return Actions.size(); } + // The remaining tests are all recursive, so bail out if we hit the limit. + if (Depth++ == MaxDepth) + return 0; + + if (SelectInst *SI = dyn_cast<SelectInst>(Op1)) + if (size_t LHSIdx = visitUDivOperand(Op0, SI->getOperand(1), I, Actions)) + if (visitUDivOperand(Op0, SI->getOperand(2), I, Actions)) { + Actions.push_back(UDivFoldAction((FoldUDivOperandCb)0, Op1, LHSIdx-1)); + return Actions.size(); + } + + return 0; +} + +Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + + if (Value *V = SimplifyUDivInst(Op0, Op1, TD)) + return ReplaceInstUsesWith(I, V); + + // Handle the integer div common cases + if (Instruction *Common = commonIDivTransforms(I)) + return Common; + // (x lshr C1) udiv C2 --> x udiv (C2 << C1) if (ConstantInt *C2 = dyn_cast<ConstantInt>(Op1)) { Value *X; @@ -747,38 +863,6 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { } } - // X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2) - { const APInt *CI; Value *N; - if (match(Op1, m_Shl(m_Power2(CI), m_Value(N))) || - match(Op1, m_ZExt(m_Shl(m_Power2(CI), m_Value(N))))) { - if (*CI != 1) - N = Builder->CreateAdd(N, - ConstantInt::get(N->getType(), CI->logBase2())); - if (ZExtInst *Z = dyn_cast<ZExtInst>(Op1)) - N = Builder->CreateZExt(N, Z->getDestTy()); - if (I.isExact()) - return BinaryOperator::CreateExactLShr(Op0, N); - return BinaryOperator::CreateLShr(Op0, N); - } - } - - // udiv X, (Select Cond, C1, C2) --> Select Cond, (shr X, C1), (shr X, C2) - // where C1&C2 are powers of two. - { Value *Cond; const APInt *C1, *C2; - if (match(Op1, m_Select(m_Value(Cond), m_Power2(C1), m_Power2(C2)))) { - // Construct the "on true" case of the select - Value *TSI = Builder->CreateLShr(Op0, C1->logBase2(), Op1->getName()+".t", - I.isExact()); - - // Construct the "on false" case of the select - Value *FSI = Builder->CreateLShr(Op0, C2->logBase2(), Op1->getName()+".f", - I.isExact()); - - // construct the select instruction and return it. 
- return SelectInst::Create(Cond, TSI, FSI); - } - } - // (zext A) udiv (zext B) --> zext (A udiv B) if (ZExtInst *ZOp0 = dyn_cast<ZExtInst>(Op0)) if (Value *ZOp1 = dyn_castZExtVal(Op1, ZOp0->getSrcTy())) @@ -786,6 +870,37 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { I.isExact()), I.getType()); + // (LHS udiv (select (select (...)))) -> (LHS >> (select (select (...)))) + SmallVector<UDivFoldAction, 6> UDivActions; + if (visitUDivOperand(Op0, Op1, I, UDivActions)) + for (unsigned i = 0, e = UDivActions.size(); i != e; ++i) { + FoldUDivOperandCb Action = UDivActions[i].FoldAction; + Value *ActionOp1 = UDivActions[i].OperandToFold; + Instruction *Inst; + if (Action) + Inst = Action(Op0, ActionOp1, I, *this); + else { + // This action joins two actions together. The RHS of this action is + // simply the last action we processed; we saved the LHS action index in + // the joining action. + size_t SelectRHSIdx = i - 1; + Value *SelectRHS = UDivActions[SelectRHSIdx].FoldResult; + size_t SelectLHSIdx = UDivActions[i].SelectLHSIdx; + Value *SelectLHS = UDivActions[SelectLHSIdx].FoldResult; + Inst = SelectInst::Create(cast<SelectInst>(ActionOp1)->getCondition(), + SelectLHS, SelectRHS); + } + + // If this is the last action to process, return it to the InstCombiner. + // Otherwise, we insert it before the UDiv and record it so that we may + // use it as part of a joining action (i.e., a SelectInst). + if (e - i != 1) { + Inst->insertBefore(&I); + UDivActions[i].FoldResult = Inst; + } else + return Inst; + } + return 0; } @@ -846,7 +961,7 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { /// FP value and: /// 1) 1/C is exact, or /// 2) reciprocal is allowed. -/// If the convertion was successful, the simplified expression "X * 1/C" is +/// If the conversion was successful, the simplified expression "X * 1/C" is /// returned; otherwise, NULL is returned. 
/// static Instruction *CvtFDivConstToReciprocal(Value *Dividend, @@ -856,7 +971,7 @@ static Instruction *CvtFDivConstToReciprocal(Value *Dividend, APFloat Reciprocal(FpVal.getSemantics()); bool Cvt = FpVal.getExactInverse(&Reciprocal); - if (!Cvt && AllowReciprocal && FpVal.isNormal()) { + if (!Cvt && AllowReciprocal && FpVal.isFiniteNonZero()) { Reciprocal = APFloat(FpVal.getSemantics(), 1.0f); (void)Reciprocal.divide(FpVal, APFloat::rmNearestTiesToEven); Cvt = !Reciprocal.isDenormal(); @@ -876,10 +991,19 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { if (Value *V = SimplifyFDivInst(Op0, Op1, TD)) return ReplaceInstUsesWith(I, V); + if (isa<Constant>(Op0)) + if (SelectInst *SI = dyn_cast<SelectInst>(Op1)) + if (Instruction *R = FoldOpIntoSelect(I, SI)) + return R; + bool AllowReassociate = I.hasUnsafeAlgebra(); bool AllowReciprocal = I.hasAllowReciprocal(); if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) { + if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) + if (Instruction *R = FoldOpIntoSelect(I, SI)) + return R; + if (AllowReassociate) { ConstantFP *C1 = 0; ConstantFP *C2 = Op1C; @@ -891,14 +1015,14 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { // Constant *C = ConstantExpr::getFDiv(C1, C2); const APFloat &F = cast<ConstantFP>(C)->getValueAPF(); - if (F.isNormal() && !F.isDenormal()) + if (F.isNormal()) Res = BinaryOperator::CreateFMul(X, C); } else if (match(Op0, m_FDiv(m_Value(X), m_ConstantFP(C1)))) { // (X/C1)/C2 => X /(C2*C1) [=> X * 1/(C2*C1) if reciprocal is allowed] // Constant *C = ConstantExpr::getFMul(C1, C2); const APFloat &F = cast<ConstantFP>(C)->getValueAPF(); - if (F.isNormal() && !F.isDenormal()) { + if (F.isNormal()) { Res = CvtFDivConstToReciprocal(X, cast<ConstantFP>(C), AllowReciprocal); if (!Res) @@ -939,7 +1063,7 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { if (Fold) { const APFloat &FoldC = cast<ConstantFP>(Fold)->getValueAPF(); - if (FoldC.isNormal() && !FoldC.isDenormal()) { + if (FoldC.isNormal()) { Instruction *R = CreateDiv ? BinaryOperator::CreateFDiv(Fold, X) : BinaryOperator::CreateFMul(X, Fold); @@ -1027,37 +1151,26 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) { if (Instruction *common = commonIRemTransforms(I)) return common; - // X urem C^2 -> X and C-1 - { const APInt *C; - if (match(Op1, m_Power2(C))) - return BinaryOperator::CreateAnd(Op0, - ConstantInt::get(I.getType(), *C-1)); - } + // (zext A) urem (zext B) --> zext (A urem B) + if (ZExtInst *ZOp0 = dyn_cast<ZExtInst>(Op0)) + if (Value *ZOp1 = dyn_castZExtVal(Op1, ZOp0->getSrcTy())) + return new ZExtInst(Builder->CreateURem(ZOp0->getOperand(0), ZOp1), + I.getType()); - // Turn A % (C << N), where C is 2^k, into A & ((C << N)-1) - if (match(Op1, m_Shl(m_Power2(), m_Value()))) { + // X urem Y -> X and Y-1, where Y is a power of 2, + if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/true)) { Constant *N1 = Constant::getAllOnesValue(I.getType()); Value *Add = Builder->CreateAdd(Op1, N1); return BinaryOperator::CreateAnd(Op0, Add); } - // urem X, (select Cond, 2^C1, 2^C2) --> - // select Cond, (and X, C1-1), (and X, C2-1) - // when C1&C2 are powers of two. 
- { Value *Cond; const APInt *C1, *C2; - if (match(Op1, m_Select(m_Value(Cond), m_Power2(C1), m_Power2(C2)))) { - Value *TrueAnd = Builder->CreateAnd(Op0, *C1-1, Op1->getName()+".t"); - Value *FalseAnd = Builder->CreateAnd(Op0, *C2-1, Op1->getName()+".f"); - return SelectInst::Create(Cond, TrueAnd, FalseAnd); - } + // 1 urem X -> zext(X != 1) + if (match(Op0, m_One())) { + Value *Cmp = Builder->CreateICmpNE(Op1, Op0); + Value *Ext = Builder->CreateZExt(Cmp, I.getType()); + return ReplaceInstUsesWith(I, Ext); } - // (zext A) urem (zext B) --> zext (A urem B) - if (ZExtInst *ZOp0 = dyn_cast<ZExtInst>(Op0)) - if (Value *ZOp1 = dyn_castZExtVal(Op1, ZOp0->getSrcTy())) - return new ZExtInst(Builder->CreateURem(ZOp0->getOperand(0), ZOp1), - I.getType()); - return 0; } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index bd14e81..4c6d0c4 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -604,8 +604,6 @@ namespace llvm { LHS.Width == RHS.Width; } }; - template <> - struct isPodLike<LoweredPHIRecord> { static const bool value = true; }; } @@ -688,10 +686,10 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { // extracted out of it. First, sort the users by their offset and size. array_pod_sort(PHIUsers.begin(), PHIUsers.end()); - DEBUG(errs() << "SLICING UP PHI: " << FirstPhi << '\n'; - for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i) - errs() << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] <<'\n'; - ); + DEBUG(dbgs() << "SLICING UP PHI: " << FirstPhi << '\n'; + for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i) + dbgs() << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] << '\n'; + ); // PredValues - This is a temporary used when rewriting PHI nodes. It is // hoisted out here to avoid construction/destruction thrashing. 
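An aside on the reworked visitURem above: both new folds key off value tracking (isKnownToBeAPowerOfTwo) rather than matching literal constants, so the select-of-powers-of-two special case could be dropped. The two integer identities they rely on can be checked in isolation; the following standalone C++ sketch is illustrative only and is not code from this patch:

    // Sanity check of the identities behind the new urem folds:
    //   X urem Y == X & (Y - 1)   whenever Y is a power of two
    //   1 urem X == (X != 1)      as a 0/1 value
    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t X = 0; X < 1024; ++X) {
        for (unsigned K = 0; K < 10; ++K) {
          uint32_t Y = 1u << K;            // Y is a known power of two
          assert(X % Y == (X & (Y - 1)));  // the mask form of urem
        }
        if (X != 0)                        // urem by zero is undefined
          assert(1u % X == static_cast<uint32_t>(X != 1u));
      }
      return 0;
    }
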
@@ -772,7 +770,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { } PredValues.clear(); - DEBUG(errs() << " Made element PHI for offset " << Offset << ": " + DEBUG(dbgs() << " Made element PHI for offset " << Offset << ": " << *EltPHI << '\n'); ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)] = EltPHI; } @@ -792,7 +790,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { // PHINode simplification // Instruction *InstCombiner::visitPHINode(PHINode &PN) { - if (Value *V = SimplifyInstruction(&PN, TD)) + if (Value *V = SimplifyInstruction(&PN, TD, TLI)) return ReplaceInstUsesWith(PN, V); // If all PHI operands are the same operation, pull them through the PHI, diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 59502fb..283bec2 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -367,7 +367,7 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal, Value *FalseVal, InstCombiner::BuilderTy *Builder) { const ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition()); - if (!IC || !IC->isEquality()) + if (!IC || !IC->isEquality() || !SI.getType()->isIntegerTy()) return 0; Value *CmpLHS = IC->getOperand(0); @@ -662,7 +662,7 @@ static Value *foldSelectICmpAnd(const SelectInst &SI, ConstantInt *TrueVal, ConstantInt *FalseVal, InstCombiner::BuilderTy *Builder) { const ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition()); - if (!IC || !IC->isEquality()) + if (!IC || !IC->isEquality() || !SI.getType()->isIntegerTy()) return 0; if (!match(IC->getOperand(1), m_Zero())) @@ -670,8 +670,7 @@ static Value *foldSelectICmpAnd(const SelectInst &SI, ConstantInt *TrueVal, ConstantInt *AndRHS; Value *LHS = IC->getOperand(0); - if (LHS->getType() != SI.getType() || - !match(LHS, m_And(m_Value(), m_ConstantInt(AndRHS)))) + if (!match(LHS, m_And(m_Value(), m_ConstantInt(AndRHS)))) return 0; // If both select arms are non-zero see if we have a select of the form @@ -705,7 +704,13 @@ static Value *foldSelectICmpAnd(const SelectInst &SI, ConstantInt *TrueVal, unsigned ValZeros = ValC->getValue().logBase2(); unsigned AndZeros = AndRHS->getValue().logBase2(); - Value *V = LHS; + // If types don't match, we can still convert the select by introducing a zext + // or a trunc of the 'and'. The trunc case requires that all of the truncated + // bits are zero; we can figure that out by looking at the 'and' mask. + if (AndZeros >= ValC->getBitWidth()) + return 0; + + Value *V = Builder->CreateZExtOrTrunc(LHS, SI.getType()); if (ValZeros > AndZeros) V = Builder->CreateShl(V, ValZeros - AndZeros); else if (ValZeros < AndZeros) diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 60d672b..c831ddd 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -754,7 +754,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); // If it's known zero, our sign bit is also zero. 
if (LHSKnownZero.isNegative()) - KnownZero |= LHSKnownZero; + KnownZero.setBit(KnownZero.getBitWidth() - 1); } break; case Instruction::URem: { @@ -808,7 +808,6 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // TODO: Could compute known zero/one bits based on the input. break; } - case Intrinsic::x86_sse42_crc32_64_8: case Intrinsic::x86_sse42_crc32_64_64: KnownZero = APInt::getHighBitsSet(64, 32); return 0; diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 4301ddb..1e72410 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -106,8 +106,8 @@ static Value *FindScalarElement(Value *V, unsigned EltNo) { } // If we have a PHI node with a vector type that has only 2 uses: feed -// itself and be an operand of extractelemnt at a constant location, -// try to replace the PHI of the vector type with a PHI of a scalar type +// itself and be an operand of extractelement at a constant location, +// try to replace the PHI of the vector type with a PHI of a scalar type. Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { // Verify that the PHI node has exactly 2 uses. Otherwise return NULL. if (!PN->hasNUses(2)) @@ -125,17 +125,15 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { // and that it is a binary operation which is cheap to scalarize. // otherwise return NULL. if (!PHIUser->hasOneUse() || !(PHIUser->use_back() == PN) || - !(isa<BinaryOperator>(PHIUser)) || - !CheapToScalarize(PHIUser, true)) + !(isa<BinaryOperator>(PHIUser)) || !CheapToScalarize(PHIUser, true)) return NULL; // Create a scalar PHI node that will replace the vector PHI node // just before the current PHI node. - PHINode * scalarPHI = cast<PHINode>( - InsertNewInstWith(PHINode::Create(EI.getType(), - PN->getNumIncomingValues(), ""), *PN)); + PHINode *scalarPHI = cast<PHINode>(InsertNewInstWith( + PHINode::Create(EI.getType(), PN->getNumIncomingValues(), ""), *PN)); // Scalarize each PHI operand. - for (unsigned i=0; i < PN->getNumIncomingValues(); i++) { + for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) { Value *PHIInVal = PN->getIncomingValue(i); BasicBlock *inBB = PN->getIncomingBlock(i); Value *Elt = EI.getIndexOperand(); @@ -145,17 +143,17 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { // scalar PHI and the second operand is extracted from the other // vector operand. BinaryOperator *B0 = cast<BinaryOperator>(PHIUser); - unsigned opId = (B0->getOperand(0) == PN) ? 1: 0; - Value *Op = Builder->CreateExtractElement( - B0->getOperand(opId), Elt, B0->getOperand(opId)->getName()+".Elt"); + unsigned opId = (B0->getOperand(0) == PN) ? 1 : 0; + Value *Op = InsertNewInstWith( + ExtractElementInst::Create(B0->getOperand(opId), Elt, + B0->getOperand(opId)->getName() + ".Elt"), + *B0); Value *newPHIUser = InsertNewInstWith( - BinaryOperator::Create(B0->getOpcode(), scalarPHI,Op), - *B0); + BinaryOperator::Create(B0->getOpcode(), scalarPHI, Op), *B0); scalarPHI->addIncoming(newPHIUser, inBB); } else { // Scalarize PHI input: - Instruction *newEI = - ExtractElementInst::Create(PHIInVal, Elt, ""); + Instruction *newEI = ExtractElementInst::Create(PHIInVal, Elt, ""); // Insert the new instruction into the predecessor basic block. 
Instruction *pos = dyn_cast<Instruction>(PHIInVal); BasicBlock::iterator InsertPos; @@ -224,7 +222,7 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { if (PHINode *PN = dyn_cast<PHINode>(EI.getOperand(0))) { Instruction *scalarPHI = scalarizePHI(EI, PN); if (scalarPHI) - return (scalarPHI); + return scalarPHI; } } @@ -284,6 +282,38 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { Worklist.AddValue(EE); return CastInst::Create(CI->getOpcode(), EE, EI.getType()); } + } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) { + if (SI->hasOneUse()) { + // TODO: For a select on vectors, it might be useful to do this if it + // has multiple extractelement uses. For vector select, that seems to + // fight the vectorizer. + + // If we are extracting an element from a vector select or a select on + // vectors, create a select on the scalars extracted from the vector + // arguments. + Value *TrueVal = SI->getTrueValue(); + Value *FalseVal = SI->getFalseValue(); + + Value *Cond = SI->getCondition(); + if (Cond->getType()->isVectorTy()) { + Cond = Builder->CreateExtractElement(Cond, + EI.getIndexOperand(), + Cond->getName() + ".elt"); + } + + Value *V1Elem + = Builder->CreateExtractElement(TrueVal, + EI.getIndexOperand(), + TrueVal->getName() + ".elt"); + + Value *V2Elem + = Builder->CreateExtractElement(FalseVal, + EI.getIndexOperand(), + FalseVal->getName() + ".elt"); + return SelectInst::Create(Cond, + V1Elem, + V2Elem, + SI->getName() + ".elt"); + } } } return 0; @@ -296,7 +326,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, SmallVectorImpl<Constant*> &Mask) { assert(V->getType() == LHS->getType() && V->getType() == RHS->getType() && "Invalid CollectSingleShuffleElements"); - unsigned NumElts = cast<VectorType>(V->getType())->getNumElements(); + unsigned NumElts = V->getType()->getVectorNumElements(); if (isa<UndefValue>(V)) { Mask.assign(NumElts, UndefValue::get(Type::getInt32Ty(V->getContext()))); @@ -496,6 +526,254 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) { return 0; } +/// Return true if we can evaluate the specified expression tree if the vector +/// elements were shuffled in a different order. +static bool CanEvaluateShuffled(Value *V, ArrayRef<int> Mask, + unsigned Depth = 5) { + // We can always reorder the elements of a constant. + if (isa<Constant>(V)) + return true; + + // We won't reorder vector arguments. No IPO here. + Instruction *I = dyn_cast<Instruction>(V); + if (!I) return false; + + // Two users may expect different orders of the elements. Don't try it. 
+ if (!I->hasOneUse()) + return false; + + if (Depth == 0) return false; + + switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::ICmp: + case Instruction::FCmp: + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::GetElementPtr: { + for (int i = 0, e = I->getNumOperands(); i != e; ++i) { + if (!CanEvaluateShuffled(I->getOperand(i), Mask, Depth-1)) + return false; + } + return true; + } + case Instruction::InsertElement: { + ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(2)); + if (!CI) return false; + int ElementNumber = CI->getLimitedValue(); + + // Verify that 'CI' does not occur twice in Mask. A single 'insertelement' + // can't put an element into multiple indices. + bool SeenOnce = false; + for (int i = 0, e = Mask.size(); i != e; ++i) { + if (Mask[i] == ElementNumber) { + if (SeenOnce) + return false; + SeenOnce = true; + } + } + return CanEvaluateShuffled(I->getOperand(0), Mask, Depth-1); + } + } + return false; +} + +/// Rebuild a new instruction just like 'I' but with the new operands given. +/// In the event of a type mismatch, the types of the new operands are taken +/// to be correct and the result type is recomputed from them. +static Value *BuildNew(Instruction *I, ArrayRef<Value*> NewOps) { + // We don't want to use the IRBuilder here because we want the replacement + // instructions to appear next to 'I', not the builder's insertion point. 
+ switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + BinaryOperator *BO = cast<BinaryOperator>(I); + assert(NewOps.size() == 2 && "binary operator with #ops != 2"); + BinaryOperator *New = + BinaryOperator::Create(cast<BinaryOperator>(I)->getOpcode(), + NewOps[0], NewOps[1], "", BO); + if (isa<OverflowingBinaryOperator>(BO)) { + New->setHasNoUnsignedWrap(BO->hasNoUnsignedWrap()); + New->setHasNoSignedWrap(BO->hasNoSignedWrap()); + } + if (isa<PossiblyExactOperator>(BO)) { + New->setIsExact(BO->isExact()); + } + return New; + } + case Instruction::ICmp: + assert(NewOps.size() == 2 && "icmp with #ops != 2"); + return new ICmpInst(I, cast<ICmpInst>(I)->getPredicate(), + NewOps[0], NewOps[1]); + case Instruction::FCmp: + assert(NewOps.size() == 2 && "fcmp with #ops != 2"); + return new FCmpInst(I, cast<FCmpInst>(I)->getPredicate(), + NewOps[0], NewOps[1]); + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPTrunc: + case Instruction::FPExt: { + // It's possible that the mask has a different number of elements from + // the original cast. We recompute the destination type to match the mask. + Type *DestTy = + VectorType::get(I->getType()->getScalarType(), + NewOps[0]->getType()->getVectorNumElements()); + assert(NewOps.size() == 1 && "cast with #ops != 1"); + return CastInst::Create(cast<CastInst>(I)->getOpcode(), NewOps[0], DestTy, + "", I); + } + case Instruction::GetElementPtr: { + Value *Ptr = NewOps[0]; + ArrayRef<Value*> Idx = NewOps.slice(1); + GetElementPtrInst *GEP = GetElementPtrInst::Create(Ptr, Idx, "", I); + GEP->setIsInBounds(cast<GetElementPtrInst>(I)->isInBounds()); + return GEP; + } + } + llvm_unreachable("failed to rebuild vector instructions"); +} + +Value * +InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) { + // Mask.size() does not need to be equal to the number of vector elements. 
+ + assert(V->getType()->isVectorTy() && "can't reorder non-vector elements"); + if (isa<UndefValue>(V)) { + return UndefValue::get(VectorType::get(V->getType()->getScalarType(), + Mask.size())); + } + if (isa<ConstantAggregateZero>(V)) { + return ConstantAggregateZero::get( + VectorType::get(V->getType()->getScalarType(), + Mask.size())); + } + if (Constant *C = dyn_cast<Constant>(V)) { + SmallVector<Constant *, 16> MaskValues; + for (int i = 0, e = Mask.size(); i != e; ++i) { + if (Mask[i] == -1) + MaskValues.push_back(UndefValue::get(Builder->getInt32Ty())); + else + MaskValues.push_back(Builder->getInt32(Mask[i])); + } + return ConstantExpr::getShuffleVector(C, UndefValue::get(C->getType()), + ConstantVector::get(MaskValues)); + } + + Instruction *I = cast<Instruction>(V); + switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::ICmp: + case Instruction::FCmp: + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::Select: + case Instruction::GetElementPtr: { + SmallVector<Value*, 8> NewOps; + bool NeedsRebuild = (Mask.size() != I->getType()->getVectorNumElements()); + for (int i = 0, e = I->getNumOperands(); i != e; ++i) { + Value *V = EvaluateInDifferentElementOrder(I->getOperand(i), Mask); + NewOps.push_back(V); + NeedsRebuild |= (V != I->getOperand(i)); + } + if (NeedsRebuild) { + return BuildNew(I, NewOps); + } + return I; + } + case Instruction::InsertElement: { + int Element = cast<ConstantInt>(I->getOperand(2))->getLimitedValue(); + + // The insertelement was inserting at Element. Figure out which element + // that becomes after shuffling. The answer is guaranteed to be unique + // by CanEvaluateShuffled. + bool Found = false; + int Index = 0; + for (int e = Mask.size(); Index != e; ++Index) { + if (Mask[Index] == Element) { + Found = true; + break; + } + } + + if (!Found) + return UndefValue::get( + VectorType::get(V->getType()->getScalarType(), Mask.size())); + + Value *V = EvaluateInDifferentElementOrder(I->getOperand(0), Mask); + return InsertElementInst::Create(V, I->getOperand(1), + Builder->getInt32(Index), "", I); + } + } + llvm_unreachable("failed to reorder elements of vector instruction!"); +} Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { Value *LHS = SVI.getOperand(0); @@ -527,9 +805,9 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { if (LHS == RHS || isa<UndefValue>(LHS)) { if (isa<UndefValue>(LHS) && LHS == RHS) { // shuffle(undef,undef,mask) -> undef. - Value* result = (VWidth == LHSWidth) + Value *Result = (VWidth == LHSWidth) ? LHS : UndefValue::get(SVI.getType()); - return ReplaceInstUsesWith(SVI, result); + return ReplaceInstUsesWith(SVI, Result); } // Remap any references to RHS to use LHS. 
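Before the next hunk, it is worth spelling out why EvaluateInDifferentElementOrder is sound for the opcodes accepted by CanEvaluateShuffled: a purely elementwise operation commutes with a shuffle, so applying the operation to shuffled inputs yields the same vector as shuffling the operation's result. A small plain-C++ check of that property (an illustrative sketch only, not code from this patch):

    #include <array>
    #include <cassert>

    int main() {
      const std::array<int, 4> A = {1, 2, 3, 4};
      const std::array<int, 4> B = {10, 20, 30, 40};
      const std::array<int, 4> Mask = {3, 2, 1, 0};  // a reversing shuffle mask

      std::array<int, 4> Sum = {};
      for (int i = 0; i != 4; ++i)
        Sum[i] = A[i] + B[i];                        // elementwise add

      // shuffle(A) + shuffle(B) == shuffle(A + B), element by element.
      for (int i = 0; i != 4; ++i)
        assert(A[Mask[i]] + B[Mask[i]] == Sum[Mask[i]]);
      return 0;
    }
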
@@ -576,6 +854,11 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { if (isRHSID) return ReplaceInstUsesWith(SVI, RHS); } + if (isa<UndefValue>(RHS) && CanEvaluateShuffled(LHS, Mask)) { + Value *V = EvaluateInDifferentElementOrder(LHS, Mask); + return ReplaceInstUsesWith(SVI, V); + } + // If the LHS is a shufflevector itself, see if we can combine it with this // one without producing an unusual shuffle. // Cases that might be simplified: diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineWorklist.h b/contrib/llvm/lib/Transforms/InstCombine/InstCombineWorklist.h index 49efce5..f84db27 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineWorklist.h +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineWorklist.h @@ -1,4 +1,4 @@ -//===- InstCombineWorklist.h - Worklist for the InstCombine pass ----------===// +//===- InstCombineWorklist.h - Worklist for InstCombine pass ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -37,7 +37,7 @@ public: /// in it. void Add(Instruction *I) { if (WorklistMap.insert(std::make_pair(I, Worklist.size())).second) { - DEBUG(errs() << "IC: ADD: " << *I << '\n'); + DEBUG(dbgs() << "IC: ADD: " << *I << '\n'); Worklist.push_back(I); } } @@ -54,7 +54,7 @@ public: assert(Worklist.empty() && "Worklist must be empty to add initial group"); Worklist.reserve(NumEntries+16); WorklistMap.resize(NumEntries); - DEBUG(errs() << "IC: ADDING: " << NumEntries << " instrs to worklist\n"); + DEBUG(dbgs() << "IC: ADDING: " << NumEntries << " instrs to worklist\n"); for (unsigned Idx = 0; NumEntries; --NumEntries) { Instruction *I = List[NumEntries-1]; WorklistMap.insert(std::make_pair(I, Idx++)); @@ -74,8 +74,7 @@ public: } Instruction *RemoveOne() { - Instruction *I = Worklist.back(); - Worklist.pop_back(); + Instruction *I = Worklist.pop_back_val(); WorklistMap.erase(I); return I; } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index ec10751..191a101 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -699,7 +699,10 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { Value *TrueVInPred = TrueV->DoPHITranslation(PhiTransBB, ThisBB); Value *FalseVInPred = FalseV->DoPHITranslation(PhiTransBB, ThisBB); Value *InV = 0; - if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i))) + // Beware of ConstantExpr: it may eventually evaluate to getNullValue, + // even if currently isNullValue gives false. + Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)); + if (InC && !isa<ConstantExpr>(InC)) InV = InC->isNullValue() ? FalseVInPred : TrueVInPred; else InV = Builder->CreateSelect(PN->getIncomingValue(i), @@ -755,19 +758,25 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { return ReplaceInstUsesWith(I, NewPN); } -/// FindElementAtOffset - Given a type and a constant offset, determine whether -/// or not there is a sequence of GEP indices into the type that will land us at -/// the specified offset. If so, fill them into NewIndices and return the -/// resultant element type, otherwise return null. 
-Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset, - SmallVectorImpl<Value*> &NewIndices) { - if (!TD) return 0; - if (!Ty->isSized()) return 0; +/// FindElementAtOffset - Given a pointer type and a constant offset, determine +/// whether or not there is a sequence of GEP indices into the pointed type that +/// will land us at the specified offset. If so, fill them into NewIndices and +/// return the resultant element type, otherwise return null. +Type *InstCombiner::FindElementAtOffset(Type *PtrTy, int64_t Offset, + SmallVectorImpl<Value*> &NewIndices) { + assert(PtrTy->isPtrOrPtrVectorTy()); + + if (!TD) + return 0; + + Type *Ty = PtrTy->getPointerElementType(); + if (!Ty->isSized()) + return 0; // Start with the index over the outer type. Note that the type size // might be zero (even if the offset isn't zero) if the indexed type // is something like [0 x {int, int}] - Type *IntPtrTy = TD->getIntPtrType(Ty->getContext()); + Type *IntPtrTy = TD->getIntPtrType(PtrTy); int64_t FirstIdx = 0; if (int64_t TySize = TD->getTypeAllocSize(Ty)) { FirstIdx = Offset/TySize; @@ -1176,6 +1185,22 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { GetElementPtrInst::Create(Src->getOperand(0), Indices, GEP.getName()); } + // Canonicalize (gep i8* X, -(ptrtoint Y)) to (sub (ptrtoint X), (ptrtoint Y)) + // The GEP pattern is emitted by the SCEV expander for certain kinds of + // pointer arithmetic. + if (TD && GEP.getNumIndices() == 1 && + match(GEP.getOperand(1), m_Neg(m_PtrToInt(m_Value())))) { + unsigned AS = GEP.getPointerAddressSpace(); + if (GEP.getType() == Builder->getInt8PtrTy(AS) && + GEP.getOperand(1)->getType()->getScalarSizeInBits() == + TD->getPointerSizeInBits(AS)) { + Operator *Index = cast<Operator>(GEP.getOperand(1)); + Value *PtrToInt = Builder->CreatePtrToInt(PtrOp, Index->getType()); + Value *NewSub = Builder->CreateSub(PtrToInt, Index->getOperand(1)); + return CastInst::Create(Instruction::IntToPtr, NewSub, GEP.getType()); + } + } + // Handle gep(bitcast x) and gep(gep x, 0, 0, 0). Value *StrippedPtr = PtrOp->stripPointerCasts(); PointerType *StrippedPtrTy = dyn_cast<PointerType>(StrippedPtr->getType()); @@ -1231,13 +1256,12 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // %t = getelementptr i32* bitcast ([2 x i32]* %str to i32*), i32 %V // into: %t1 = getelementptr [2 x i32]* %str, i32 0, i32 %V; bitcast Type *SrcElTy = StrippedPtrTy->getElementType(); - Type *ResElTy=cast<PointerType>(PtrOp->getType())->getElementType(); + Type *ResElTy = PtrOp->getType()->getPointerElementType(); if (TD && SrcElTy->isArrayTy() && - TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType()) == + TD->getTypeAllocSize(SrcElTy->getArrayElementType()) == TD->getTypeAllocSize(ResElTy)) { - Value *Idx[2]; - Idx[0] = Constant::getNullValue(Type::getInt32Ty(GEP.getContext())); - Idx[1] = GEP.getOperand(1); + Type *IdxType = TD->getIntPtrType(GEP.getType()); + Value *Idx[2] = { Constant::getNullValue(IdxType), GEP.getOperand(1) }; Value *NewGEP = GEP.isInBounds() ? Builder->CreateInBoundsGEP(StrippedPtr, Idx, GEP.getName()) : Builder->CreateGEP(StrippedPtr, Idx, GEP.getName()); @@ -1261,7 +1285,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Earlier transforms ensure that the index has type IntPtrType, which // considerably simplifies the logic by eliminating implicit casts. 
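The canonicalization added to visitGetElementPtrInst above turns gep i8* X, -(ptrtoint Y) into inttoptr (sub (ptrtoint X), (ptrtoint Y)) when the index is pointer-width. The underlying identity, modeled in plain C++ (names are illustrative, and the pointer arithmetic is a bit-level model rather than strictly conforming C++):

    #include <cstdint>

    // gep i8* X, -(ptrtoint Y): byte-offset X by the negated address of Y.
    char *gepForm(char *X, char *Y) { return X + -(intptr_t)Y; }

    // inttoptr (sub (ptrtoint X), (ptrtoint Y)): the same bit pattern.
    char *subForm(char *X, char *Y) {
      return (char *)((intptr_t)X - (intptr_t)Y);
    }

Both forms compute X - Y reinterpreted as a pointer, which is why the GEP can be rewritten; as the comment notes, the SCEV expander emits pointer arithmetic of exactly this shape.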
- assert(Idx->getType() == TD->getIntPtrType(GEP.getContext()) && + assert(Idx->getType() == TD->getIntPtrType(GEP.getType()) && "Index not cast to pointer width?"); bool NSW; @@ -1287,8 +1311,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Check that changing to the array element type amounts to dividing the // index by a scale factor. uint64_t ResSize = TD->getTypeAllocSize(ResElTy); - uint64_t ArrayEltSize = - TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType()); + uint64_t ArrayEltSize + = TD->getTypeAllocSize(SrcElTy->getArrayElementType()); if (ResSize && ArrayEltSize % ResSize == 0) { Value *Idx = GEP.getOperand(1); unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits(); @@ -1296,7 +1320,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Earlier transforms ensure that the index has type IntPtrType, which // considerably simplifies the logic by eliminating implicit casts. - assert(Idx->getType() == TD->getIntPtrType(GEP.getContext()) && + assert(Idx->getType() == TD->getIntPtrType(GEP.getType()) && "Index not cast to pointer width?"); bool NSW; @@ -1304,9 +1328,11 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Successfully decomposed Idx as NewIdx * Scale, form a new GEP. // If the multiplication NewIdx * Scale may overflow then the new // GEP may not be "inbounds". - Value *Off[2]; - Off[0] = Constant::getNullValue(Type::getInt32Ty(GEP.getContext())); - Off[1] = NewIdx; + Value *Off[2] = { + Constant::getNullValue(TD->getIntPtrType(GEP.getType())), + NewIdx + }; + Value *NewGEP = GEP.isInBounds() && NSW ? Builder->CreateInBoundsGEP(StrippedPtr, Off, GEP.getName()) : Builder->CreateGEP(StrippedPtr, Off, GEP.getName()); @@ -1318,15 +1344,20 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { } } + if (!TD) + return 0; + /// See if we can simplify: /// X = bitcast A* to B* /// Y = gep X, <...constant indices...> /// into a gep of the original struct. This is important for SROA and alias /// analysis of unions. If "A" is also a bitcast, wait for A/X to be merged. if (BitCastInst *BCI = dyn_cast<BitCastInst>(PtrOp)) { - APInt Offset(TD ? TD->getPointerSizeInBits() : 1, 0); - if (TD && - !isa<BitCastInst>(BCI->getOperand(0)) && + Value *Operand = BCI->getOperand(0); + PointerType *OpType = cast<PointerType>(Operand->getType()); + unsigned OffsetBits = TD->getPointerTypeSizeInBits(OpType); + APInt Offset(OffsetBits, 0); + if (!isa<BitCastInst>(Operand) && GEP.accumulateConstantOffset(*TD, Offset) && StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace()) { @@ -1335,8 +1366,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (!Offset) { // If the bitcast is of an allocation, and the allocation will be // converted to match the type of the cast, don't touch this. - if (isa<AllocaInst>(BCI->getOperand(0)) || - isAllocationFn(BCI->getOperand(0), TLI)) { + if (isa<AllocaInst>(Operand) || isAllocationFn(Operand, TLI)) { // See if the bitcast simplifies, if so, don't nuke this GEP yet. if (Instruction *I = visitBitCast(*BCI)) { if (I != BCI) { @@ -1347,19 +1377,17 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { return &GEP; } } - return new BitCastInst(BCI->getOperand(0), GEP.getType()); + return new BitCastInst(Operand, GEP.getType()); } // Otherwise, if the offset is non-zero, we need to find out if there is a // field at Offset in 'A's type. 
If so, we can pull the cast through the // GEP. SmallVector<Value*, 8> NewIndices; - Type *InTy = - cast<PointerType>(BCI->getOperand(0)->getType())->getElementType(); - if (FindElementAtOffset(InTy, Offset.getSExtValue(), NewIndices)) { + if (FindElementAtOffset(OpType, Offset.getSExtValue(), NewIndices)) { Value *NGEP = GEP.isInBounds() ? - Builder->CreateInBoundsGEP(BCI->getOperand(0), NewIndices) : - Builder->CreateGEP(BCI->getOperand(0), NewIndices); + Builder->CreateInBoundsGEP(Operand, NewIndices) : + Builder->CreateGEP(Operand, NewIndices); if (NGEP->getType() == GEP.getType()) return ReplaceInstUsesWith(GEP, NGEP); @@ -1372,8 +1400,6 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { return 0; } - - static bool isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users, const TargetLibraryInfo *TLI) { @@ -2042,7 +2068,7 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { continue; // If Filter is a subset of LFilter, i.e. every element of Filter is also // an element of LFilter, then discard LFilter. - SmallVector<Value *, 16>::iterator J = NewClauses.begin() + j; + SmallVectorImpl<Value *>::iterator J = NewClauses.begin() + j; // If Filter is empty then it is a subset of LFilter. if (!FElts) { // Discard LFilter. @@ -2209,7 +2235,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, // DCE instruction if trivially dead. if (isInstructionTriviallyDead(Inst, TLI)) { ++NumDeadInst; - DEBUG(errs() << "IC: DCE: " << *Inst << '\n'); + DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n'); Inst->eraseFromParent(); continue; } @@ -2217,7 +2243,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, // ConstantProp instruction if trivially constant. if (!Inst->use_empty() && isa<Constant>(Inst->getOperand(0))) if (Constant *C = ConstantFoldInstruction(Inst, TD, TLI)) { - DEBUG(errs() << "IC: ConstFold to: " << *C << " from: " + DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *Inst << '\n'); Inst->replaceAllUsesWith(C); ++NumConstProp; @@ -2293,7 +2319,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { MadeIRChange = false; - DEBUG(errs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on " + DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on " << F.getName() << "\n"); { @@ -2338,7 +2364,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { // Check to see if we can DCE the instruction. if (isInstructionTriviallyDead(I, TLI)) { - DEBUG(errs() << "IC: DCE: " << *I << '\n'); + DEBUG(dbgs() << "IC: DCE: " << *I << '\n'); EraseInstFromFunction(*I); ++NumDeadInst; MadeIRChange = true; @@ -2348,7 +2374,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { // Instruction isn't dead, see if we can constant propagate it. if (!I->use_empty() && isa<Constant>(I->getOperand(0))) if (Constant *C = ConstantFoldInstruction(I, TD, TLI)) { - DEBUG(errs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n'); + DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n'); // Add operands to the worklist. 
ReplaceInstUsesWith(*I, C); @@ -2396,13 +2422,13 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { std::string OrigI; #endif DEBUG(raw_string_ostream SS(OrigI); I->print(SS); OrigI = SS.str();); - DEBUG(errs() << "IC: Visiting: " << OrigI << '\n'); + DEBUG(dbgs() << "IC: Visiting: " << OrigI << '\n'); if (Instruction *Result = visit(*I)) { ++NumCombined; // Should we replace the old instruction with a new one? if (Result != I) { - DEBUG(errs() << "IC: Old = " << *I << '\n' + DEBUG(dbgs() << "IC: Old = " << *I << '\n' << " New = " << *Result << '\n'); if (!I->getDebugLoc().isUnknown()) @@ -2431,7 +2457,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { EraseInstFromFunction(*I); } else { #ifndef NDEBUG - DEBUG(errs() << "IC: Mod = " << OrigI << '\n' + DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n' << " New = " << *I << '\n'); #endif diff --git a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 623c470..d731ec5 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -23,6 +23,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" #include "llvm/DIBuilder.h" @@ -39,13 +40,14 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/system_error.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/BlackList.h" +#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Transforms/Utils/SpecialCaseList.h" #include <algorithm> #include <string> @@ -56,36 +58,49 @@ static const uint64_t kDefaultShadowOffset32 = 1ULL << 29; static const uint64_t kDefaultShadowOffset64 = 1ULL << 44; static const uint64_t kDefaultShort64bitShadowOffset = 0x7FFF8000; // < 2G. 
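The offsets above parameterize ASan's shadow mapping, Shadow = (Mem >> Scale) + Offset (see the comment on the -asan-mapping-* flags below), with the default Scale of 3 so one shadow byte covers eight application bytes. As a sketch (memToShadow is illustrative, not a function in this file):

    #include <cstdint>

    // Default ASan mapping: one shadow byte per 8 application bytes.
    uint64_t memToShadow(uint64_t Mem, uint64_t Offset, unsigned Scale = 3) {
      return (Mem >> Scale) + Offset;
    }
    // e.g. on a default 64-bit target:
    //   uint64_t S = memToShadow(Addr, 1ULL << 44);  // kDefaultShadowOffset64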
static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 41; +static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa8000; +static const size_t kMinStackMallocSize = 1 << 6; // 64B static const size_t kMaxStackMallocSize = 1 << 16; // 64K static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3; static const uintptr_t kRetiredStackFrameMagic = 0x45E0360E; -static const char *kAsanModuleCtorName = "asan.module_ctor"; -static const char *kAsanModuleDtorName = "asan.module_dtor"; -static const int kAsanCtorAndCtorPriority = 1; -static const char *kAsanReportErrorTemplate = "__asan_report_"; -static const char *kAsanReportLoadN = "__asan_report_load_n"; -static const char *kAsanReportStoreN = "__asan_report_store_n"; -static const char *kAsanRegisterGlobalsName = "__asan_register_globals"; -static const char *kAsanUnregisterGlobalsName = "__asan_unregister_globals"; -static const char *kAsanPoisonGlobalsName = "__asan_before_dynamic_init"; -static const char *kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init"; -static const char *kAsanInitName = "__asan_init_v3"; -static const char *kAsanHandleNoReturnName = "__asan_handle_no_return"; -static const char *kAsanMappingOffsetName = "__asan_mapping_offset"; -static const char *kAsanMappingScaleName = "__asan_mapping_scale"; -static const char *kAsanStackMallocName = "__asan_stack_malloc"; -static const char *kAsanStackFreeName = "__asan_stack_free"; -static const char *kAsanGenPrefix = "__asan_gen_"; -static const char *kAsanPoisonStackMemoryName = "__asan_poison_stack_memory"; -static const char *kAsanUnpoisonStackMemoryName = +static const char *const kAsanModuleCtorName = "asan.module_ctor"; +static const char *const kAsanModuleDtorName = "asan.module_dtor"; +static const int kAsanCtorAndCtorPriority = 1; +static const char *const kAsanReportErrorTemplate = "__asan_report_"; +static const char *const kAsanReportLoadN = "__asan_report_load_n"; +static const char *const kAsanReportStoreN = "__asan_report_store_n"; +static const char *const kAsanRegisterGlobalsName = "__asan_register_globals"; +static const char *const kAsanUnregisterGlobalsName = + "__asan_unregister_globals"; +static const char *const kAsanPoisonGlobalsName = "__asan_before_dynamic_init"; +static const char *const kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init"; +static const char *const kAsanInitName = "__asan_init_v3"; +static const char *const kAsanCovName = "__sanitizer_cov"; +static const char *const kAsanHandleNoReturnName = "__asan_handle_no_return"; +static const char *const kAsanMappingOffsetName = "__asan_mapping_offset"; +static const char *const kAsanMappingScaleName = "__asan_mapping_scale"; +static const int kMaxAsanStackMallocSizeClass = 10; +static const char *const kAsanStackMallocNameTemplate = "__asan_stack_malloc_"; +static const char *const kAsanStackFreeNameTemplate = "__asan_stack_free_"; +static const char *const kAsanGenPrefix = "__asan_gen_"; +static const char *const kAsanPoisonStackMemoryName = + "__asan_poison_stack_memory"; +static const char *const kAsanUnpoisonStackMemoryName = "__asan_unpoison_stack_memory"; +static const char *const kAsanOptionDetectUAR = + "__asan_option_detect_stack_use_after_return"; + +// These constants must match the definitions in the run-time library. 
static const int kAsanStackLeftRedzoneMagic = 0xf1; static const int kAsanStackMidRedzoneMagic = 0xf2; static const int kAsanStackRightRedzoneMagic = 0xf3; static const int kAsanStackPartialRedzoneMagic = 0xf4; +#ifndef NDEBUG +static const int kAsanStackAfterReturnMagic = 0xf5; +#endif // Accesses sizes are powers of two: 1, 2, 4, 8, 16. static const size_t kNumberOfAccessSizes = 5; @@ -120,6 +135,8 @@ static cl::opt<bool> ClUseAfterReturn("asan-use-after-return", // This flag may need to be replaced with -f[no]asan-globals. static cl::opt<bool> ClGlobals("asan-globals", cl::desc("Handle global objects"), cl::Hidden, cl::init(true)); +static cl::opt<bool> ClCoverage("asan-coverage", + cl::desc("ASan coverage"), cl::Hidden, cl::init(false)); static cl::opt<bool> ClInitializers("asan-initialization-order", cl::desc("Handle C++ initializer order"), cl::Hidden, cl::init(false)); static cl::opt<bool> ClMemIntrin("asan-memintrin", @@ -130,6 +147,19 @@ static cl::opt<std::string> ClBlacklistFile("asan-blacklist", cl::desc("File containing the list of objects to ignore " "during instrumentation"), cl::Hidden); +// This is an experimental feature that will allow to choose between +// instrumented and non-instrumented code at link-time. +// If this option is on, just before instrumenting a function we create its +// clone; if the function is not changed by asan the clone is deleted. +// If we end up with a clone, we put the instrumented function into a section +// called "ASAN" and the uninstrumented function into a section called "NOASAN". +// +// This is still a prototype, we need to figure out a way to keep two copies of +// a function so that the linker can easily choose one of them. +static cl::opt<bool> ClKeepUninstrumented("asan-keep-uninstrumented-functions", + cl::desc("Keep uninstrumented copies of functions"), + cl::Hidden, cl::init(false)); + // These flags allow to change the shadow mapping. // The shadow mapping looks like // Shadow = (Mem >> scale) + (1 << offset_log) @@ -167,6 +197,13 @@ static cl::opt<int> ClDebugMin("asan-debug-min", cl::desc("Debug min inst"), static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug man inst"), cl::Hidden, cl::init(-1)); +STATISTIC(NumInstrumentedReads, "Number of instrumented reads"); +STATISTIC(NumInstrumentedWrites, "Number of instrumented writes"); +STATISTIC(NumOptimizedAccessesToGlobalArray, + "Number of optimized accesses to global arrays"); +STATISTIC(NumOptimizedAccessesToGlobalVar, + "Number of optimized accesses to global vars"); + namespace { /// A set of dynamically initialized globals extracted from metadata. class SetOfDynamicallyInitializedGlobals { @@ -206,8 +243,11 @@ static ShadowMapping getShadowMapping(const Module &M, int LongSize, llvm::Triple TargetTriple(M.getTargetTriple()); bool IsAndroid = TargetTriple.getEnvironment() == llvm::Triple::Android; bool IsMacOSX = TargetTriple.getOS() == llvm::Triple::MacOSX; - bool IsPPC64 = TargetTriple.getArch() == llvm::Triple::ppc64; + bool IsPPC64 = TargetTriple.getArch() == llvm::Triple::ppc64 || + TargetTriple.getArch() == llvm::Triple::ppc64le; bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64; + bool IsMIPS32 = TargetTriple.getArch() == llvm::Triple::mips || + TargetTriple.getArch() == llvm::Triple::mipsel; ShadowMapping Mapping; @@ -217,7 +257,8 @@ static ShadowMapping getShadowMapping(const Module &M, int LongSize, Mapping.OrShadowOffset = !IsPPC64 && !ClShort64BitOffset; Mapping.Offset = (IsAndroid || ZeroBaseShadow) ? 0 : - (LongSize == 32 ? 
kDefaultShadowOffset32 : + (LongSize == 32 ? + (IsMIPS32 ? kMIPS32_ShadowOffset32 : kDefaultShadowOffset32) : IsPPC64 ? kPPC64_ShadowOffset64 : kDefaultShadowOffset64); if (!ZeroBaseShadow && ClShort64BitOffset && IsX86_64 && !IsMacOSX) { assert(LongSize == 64); @@ -285,6 +326,8 @@ struct AddressSanitizer : public FunctionPass { bool ShouldInstrumentGlobal(GlobalVariable *G); bool LooksLikeCodeInBug11395(Instruction *I); void FindDynamicInitializers(Module &M); + bool GlobalIsLinkerInitialized(GlobalVariable *G); + bool InjectCoverage(Function &F); bool CheckInitOrder; bool CheckUseAfterReturn; @@ -300,7 +343,8 @@ struct AddressSanitizer : public FunctionPass { Function *AsanCtorFunction; Function *AsanInitFunction; Function *AsanHandleNoReturnFunc; - OwningPtr<BlackList> BL; + Function *AsanCovFunction; + OwningPtr<SpecialCaseList> BL; // This array is indexed by AccessIsWrite and log2(AccessSize). Function *AsanErrorCallback[2][kNumberOfAccessSizes]; // This array is indexed by AccessIsWrite. @@ -340,7 +384,7 @@ class AddressSanitizerModule : public ModulePass { SmallString<64> BlacklistFile; bool ZeroBaseShadow; - OwningPtr<BlackList> BL; + OwningPtr<SpecialCaseList> BL; SetOfDynamicallyInitializedGlobals DynamicallyInitializedGlobals; Type *IntptrTy; LLVMContext *C; @@ -375,12 +419,14 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { uint64_t TotalStackSize; unsigned StackAlignment; - Function *AsanStackMallocFunc, *AsanStackFreeFunc; + Function *AsanStackMallocFunc[kMaxAsanStackMallocSizeClass + 1], + *AsanStackFreeFunc[kMaxAsanStackMallocSizeClass + 1]; Function *AsanPoisonStackMemoryFunc, *AsanUnpoisonStackMemoryFunc; // Stores a place and arguments of poisoning/unpoisoning call for alloca. struct AllocaPoisonCall { IntrinsicInst *InsBefore; + AllocaInst *AI; uint64_t Size; bool DoPoison; }; @@ -433,7 +479,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { StackAlignment = std::max(StackAlignment, AI.getAlignment()); AllocaVec.push_back(&AI); - uint64_t AlignedSize = getAlignedAllocaSize(&AI); + uint64_t AlignedSize = getAlignedAllocaSize(&AI); TotalStackSize += AlignedSize; } @@ -459,7 +505,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { AllocaInst *AI = findAllocaForValue(II.getArgOperand(1)); if (!AI) return; bool DoPoison = (ID == Intrinsic::lifetime_end); - AllocaPoisonCall APC = {&II, SizeValue, DoPoison}; + AllocaPoisonCall APC = {&II, AI, SizeValue, DoPoison}; AllocaPoisonCallVec.push_back(APC); } @@ -467,33 +513,37 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { void initializeCallbacks(Module &M); // Check if we want (and can) handle this alloca. 
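visitAllocaInst above accumulates TotalStackSize from redzone-aligned alloca sizes, and lifetime intrinsics are recorded as AllocaPoisonCall entries (now carrying the AllocaInst directly). The rounding behind those aligned sizes (getAlignedSize, shown just below) is the usual round-up-to-multiple idiom; restated standalone:

    #include <cstdint>

    // Round Size up to the next multiple of the redzone size RZ.
    uint64_t alignedSize(uint64_t Size, uint64_t RZ) {
      return ((Size + RZ - 1) / RZ) * RZ;
    }
    // alignedSize(1, 32) == 32; alignedSize(33, 32) == 64. Every alloca thus
    // occupies whole redzone-sized slots, keeping poisoning shadow-aligned.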
- bool isInterestingAlloca(AllocaInst &AI) { + bool isInterestingAlloca(AllocaInst &AI) const { return (!AI.isArrayAllocation() && AI.isStaticAlloca() && + AI.getAlignment() <= RedzoneSize() && AI.getAllocatedType()->isSized()); } size_t RedzoneSize() const { return RedzoneSizeForScale(Mapping.Scale); } - uint64_t getAllocaSizeInBytes(AllocaInst *AI) { + uint64_t getAllocaSizeInBytes(AllocaInst *AI) const { Type *Ty = AI->getAllocatedType(); uint64_t SizeInBytes = ASan.TD->getTypeAllocSize(Ty); return SizeInBytes; } - uint64_t getAlignedSize(uint64_t SizeInBytes) { + uint64_t getAlignedSize(uint64_t SizeInBytes) const { size_t RZ = RedzoneSize(); return ((SizeInBytes + RZ - 1) / RZ) * RZ; } - uint64_t getAlignedAllocaSize(AllocaInst *AI) { + uint64_t getAlignedAllocaSize(AllocaInst *AI) const { uint64_t SizeInBytes = getAllocaSizeInBytes(AI); return getAlignedSize(SizeInBytes); } /// Finds alloca where the value comes from. AllocaInst *findAllocaForValue(Value *V); - void poisonRedZones(const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB, + void poisonRedZones(const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> &IRB, Value *ShadowBase, bool DoPoison); - void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> IRB, bool DoPoison); + void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> &IRB, bool DoPoison); + + void SetShadowToStackAfterReturnInlined(IRBuilder<> &IRB, Value *ShadowBase, + int Size); }; } // namespace @@ -520,16 +570,16 @@ ModulePass *llvm::createAddressSanitizerModulePass( } static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { - size_t Res = CountTrailingZeros_32(TypeSize / 8); + size_t Res = countTrailingZeros(TypeSize / 8); assert(Res < kNumberOfAccessSizes); return Res; } -// Create a constant for Str so that we can pass it to the run-time lib. +// \brief Create a constant for Str so that we can pass it to the run-time lib. static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str) { Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str); GlobalVariable *GV = new GlobalVariable(M, StrConst->getType(), true, - GlobalValue::PrivateLinkage, StrConst, + GlobalValue::InternalLinkage, StrConst, kAsanGenPrefix); GV->setUnnamedAddr(true); // Ok to merge these. GV->setAlignment(1); // Strings may not be merged w/o setting align 1. @@ -620,6 +670,13 @@ static Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite) { return NULL; } +bool AddressSanitizer::GlobalIsLinkerInitialized(GlobalVariable *G) { + // If a global variable does not have dynamic initialization we don't + // have to instrument it. However, if a global does not have initializer + // at all, we assume it has dynamic initializer (in other TU). + return G->hasInitializer() && !DynamicallyInitializedGlobals.Contains(G); +} + void AddressSanitizer::instrumentMop(Instruction *I) { bool IsWrite = false; Value *Addr = isInterestingMemoryAccess(I, &IsWrite); @@ -628,13 +685,19 @@ void AddressSanitizer::instrumentMop(Instruction *I) { if (GlobalVariable *G = dyn_cast<GlobalVariable>(Addr)) { // If initialization order checking is disabled, a simple access to a // dynamically initialized global is always valid. - if (!CheckInitOrder) - return; - // If a global variable does not have dynamic initialization we don't - // have to instrument it. However, if a global does not have initailizer - // at all, we assume it has dynamic initializer (in other TU). 
- if (G->hasInitializer() && !DynamicallyInitializedGlobals.Contains(G)) + if (!CheckInitOrder || GlobalIsLinkerInitialized(G)) { + NumOptimizedAccessesToGlobalVar++; return; + } + } + ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr); + if (CE && CE->isGEPWithNoNotionalOverIndexing()) { + if (GlobalVariable *G = dyn_cast<GlobalVariable>(CE->getOperand(0))) { + if (CE->getOperand(1)->isNullValue() && GlobalIsLinkerInitialized(G)) { + NumOptimizedAccessesToGlobalArray++; + return; + } + } } } @@ -646,6 +709,11 @@ void AddressSanitizer::instrumentMop(Instruction *I) { assert((TypeSize % 8) == 0); + if (IsWrite) + NumInstrumentedWrites++; + else + NumInstrumentedReads++; + // Instrument a 1-, 2-, 4-, 8-, or 16- byte access with one check. if (TypeSize == 8 || TypeSize == 16 || TypeSize == 32 || TypeSize == 64 || TypeSize == 128) @@ -861,7 +929,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) { TD = getAnalysisIfAvailable<DataLayout>(); if (!TD) return false; - BL.reset(new BlackList(BlacklistFile)); + BL.reset(SpecialCaseList::createOrDie(BlacklistFile)); if (BL->isIn(M)) return false; C = &(M.getContext()); int LongSize = TD->getPointerSizeInBits(); @@ -892,8 +960,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) { StructType *GlobalStructTy = StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy, NULL); - SmallVector<Constant *, 16> Initializers(n), DynamicInit; - + SmallVector<Constant *, 16> Initializers(n); Function *CtorFunc = M.getFunction(kAsanModuleCtorName); assert(CtorFunc); @@ -929,7 +996,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) { bool GlobalHasDynamicInitializer = DynamicallyInitializedGlobals.Contains(G); // Don't check initialization order if this global is blacklisted. - GlobalHasDynamicInitializer &= !BL->isInInit(*G); + GlobalHasDynamicInitializer &= !BL->isIn(*G, "init"); StructType *NewTy = StructType::get(Ty, RightRedZoneTy, NULL); Constant *NewInitializer = ConstantStruct::get( @@ -939,8 +1006,11 @@ bool AddressSanitizerModule::runOnModule(Module &M) { GlobalVariable *Name = createPrivateGlobalForString(M, G->getName()); // Create a new global variable with enough space for a redzone. + GlobalValue::LinkageTypes Linkage = G->getLinkage(); + if (G->isConstant() && Linkage == GlobalValue::PrivateLinkage) + Linkage = GlobalValue::InternalLinkage; GlobalVariable *NewGlobal = new GlobalVariable( - M, NewTy, G->isConstant(), G->getLinkage(), + M, NewTy, G->isConstant(), Linkage, NewInitializer, "", G, G->getThreadLocalMode()); NewGlobal->copyAttributesFrom(G); NewGlobal->setAlignment(MinRZ); @@ -973,7 +1043,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) { ArrayType *ArrayOfGlobalStructTy = ArrayType::get(GlobalStructTy, n); GlobalVariable *AllGlobals = new GlobalVariable( - M, ArrayOfGlobalStructTy, false, GlobalVariable::PrivateLinkage, + M, ArrayOfGlobalStructTy, false, GlobalVariable::InternalLinkage, ConstantArray::get(ArrayOfGlobalStructTy, Initializers), ""); // Create calls for poisoning before initializers run and unpoisoning after. @@ -1021,6 +1091,8 @@ void AddressSanitizer::initializeCallbacks(Module &M) { AsanHandleNoReturnFunc = checkInterfaceFunction(M.getOrInsertFunction( kAsanHandleNoReturnName, IRB.getVoidTy(), NULL)); + AsanCovFunction = checkInterfaceFunction(M.getOrInsertFunction( + kAsanCovName, IRB.getVoidTy(), IntptrTy, NULL)); // We insert an empty inline asm after __asan_report* to avoid callback merge. 
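instrumentMop above gains a second fast path: besides direct accesses to a linker-initialized global, it now recognizes a ConstantExpr GEP at a constant, non-over-indexed offset into such a global (counted by NumOptimizedAccessesToGlobalArray). A C++ analogue of the access shape this covers (illustrative; before optimization such a read compiles to a load through a constant GEP into the global):

    // Statically in-bounds access to a linker-initialized global: ASan can
    // prove it valid at compile time and skip the shadow check.
    static const int Table[4] = {1, 2, 3, 4};
    int firstEntry() { return Table[0]; }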
EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), StringRef(""), StringRef(""), @@ -1051,7 +1123,7 @@ bool AddressSanitizer::doInitialization(Module &M) { if (!TD) return false; - BL.reset(new BlackList(BlacklistFile)); + BL.reset(SpecialCaseList::createOrDie(BlacklistFile)); DynamicallyInitializedGlobals.Init(M); C = &(M.getContext()); @@ -1092,6 +1164,47 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) { return false; } +// Poor man's coverage that works with ASan. +// We create a Guard boolean variable with the same linkage +// as the function and inject this code into the entry block: +// if (*Guard) { +// __sanitizer_cov(&F); +// *Guard = 1; +// } +// The accesses to Guard are atomic. The rest of the logic is +// in __sanitizer_cov (it's fine to call it more than once). +// +// This coverage implementation provides very limited data: +// it only tells if a given function was ever executed. +// No counters, no per-basic-block or per-edge data. +// But for many use cases this is what we need and the added slowdown +// is negligible. This simple implementation will probably be obsoleted +// by the upcoming Clang-based coverage implementation. +// By having it here and now we hope to +// a) get the functionality to users earlier and +// b) collect usage statistics to help improve Clang coverage design. +bool AddressSanitizer::InjectCoverage(Function &F) { + if (!ClCoverage) return false; + IRBuilder<> IRB(F.getEntryBlock().getFirstInsertionPt()); + Type *Int8Ty = IRB.getInt8Ty(); + GlobalVariable *Guard = new GlobalVariable( + *F.getParent(), Int8Ty, false, GlobalValue::PrivateLinkage, + Constant::getNullValue(Int8Ty), "__asan_gen_cov_" + F.getName()); + LoadInst *Load = IRB.CreateLoad(Guard); + Load->setAtomic(Monotonic); + Load->setAlignment(1); + Value *Cmp = IRB.CreateICmpEQ(Constant::getNullValue(Int8Ty), Load); + Instruction *Ins = SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false); + IRB.SetInsertPoint(Ins); + // We pass &F to __sanitizer_cov. We could avoid this and rely on + // GET_CALLER_PC, but having the PC of the first instruction is just nice. + IRB.CreateCall(AsanCovFunction, IRB.CreatePointerCast(&F, IntptrTy)); + StoreInst *Store = IRB.CreateStore(ConstantInt::get(Int8Ty, 1), Guard); + Store->setAtomic(Monotonic); + Store->setAlignment(1); + return true; +} + bool AddressSanitizer::runOnFunction(Function &F) { if (BL->isIn(F)) return false; if (&F == AsanCtorFunction) return false; @@ -1102,8 +1215,7 @@ bool AddressSanitizer::runOnFunction(Function &F) { // If needed, insert __asan_init before checking for SanitizeAddress attr. maybeInsertAsanInitAtFunctionEntry(F); - if (!F.getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::SanitizeAddress)) + if (!F.hasFnAttribute(Attribute::SanitizeAddress)) return false; if (!ClDebugFunc.empty() && ClDebugFunc != F.getName()) @@ -1114,6 +1226,7 @@ bool AddressSanitizer::runOnFunction(Function &F) { SmallSet<Value*, 16> TempsToInstrument; SmallVector<Instruction*, 16> ToInstrument; SmallVector<Instruction*, 8> NoReturnCalls; + int NumAllocas = 0; bool IsWrite; // Fill the set of memory operations to instrument. @@ -1132,6 +1245,8 @@ bool AddressSanitizer::runOnFunction(Function &F) { } else if (isa<MemIntrinsic>(BI) && ClMemIntrin) { // ok, take it. } else { + if (isa<AllocaInst>(BI)) + NumAllocas++; CallSite CS(BI); if (CS) { // A call inside BB. 
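InjectCoverage above amounts to the following per-function logic, written as a C-style analogue (the real loads and stores are monotonic atomics with alignment 1, and the __sanitizer_cov prototype is approximated from the IntptrTy parameter it is declared with in initializeCallbacks; names are illustrative):

    extern "C" void __sanitizer_cov(void *pc);  // runtime hook; receives &F

    void instrumented_function(void) {
      static unsigned char guard;               // the per-function Guard
      if (guard == 0) {
        __sanitizer_cov((void *)&instrumented_function);
        guard = 1;
      }
      /* ...original function body... */
    }

Note that the emitted IR tests *Guard == 0 before calling the hook, so the "if (*Guard)" shorthand in the block comment reads inverted relative to the code.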
@@ -1148,6 +1263,17 @@ bool AddressSanitizer::runOnFunction(Function &F) { } } + Function *UninstrumentedDuplicate = 0; + bool LikelyToInstrument = + !NoReturnCalls.empty() || !ToInstrument.empty() || (NumAllocas > 0); + if (ClKeepUninstrumented && LikelyToInstrument) { + ValueToValueMapTy VMap; + UninstrumentedDuplicate = CloneFunction(&F, VMap, false); + UninstrumentedDuplicate->removeFnAttr(Attribute::SanitizeAddress); + UninstrumentedDuplicate->setName("NOASAN_" + F.getName()); + F.getParent()->getFunctionList().push_back(UninstrumentedDuplicate); + } + // Instrument. int NumInstrumented = 0; for (size_t i = 0, n = ToInstrument.size(); i != n; i++) { @@ -1172,9 +1298,29 @@ bool AddressSanitizer::runOnFunction(Function &F) { IRBuilder<> IRB(CI); IRB.CreateCall(AsanHandleNoReturnFunc); } - DEBUG(dbgs() << "ASAN done instrumenting:\n" << F << "\n"); - return NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty(); + bool res = NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty(); + + if (InjectCoverage(F)) + res = true; + + DEBUG(dbgs() << "ASAN done instrumenting: " << res << " " << F << "\n"); + + if (ClKeepUninstrumented) { + if (!res) { + // No instrumentation is done, no need for the duplicate. + if (UninstrumentedDuplicate) + UninstrumentedDuplicate->eraseFromParent(); + } else { + // The function was instrumented. We must have the duplicate. + assert(UninstrumentedDuplicate); + UninstrumentedDuplicate->setSection("NOASAN"); + assert(!F.hasSection()); + F.setSection("ASAN"); + } + } + + return res; } static uint64_t ValueForPoison(uint64_t PoisonByte, size_t ShadowRedzoneSize) { @@ -1217,11 +1363,15 @@ bool AddressSanitizer::LooksLikeCodeInBug11395(Instruction *I) { void FunctionStackPoisoner::initializeCallbacks(Module &M) { IRBuilder<> IRB(*C); - AsanStackMallocFunc = checkInterfaceFunction(M.getOrInsertFunction( - kAsanStackMallocName, IntptrTy, IntptrTy, IntptrTy, NULL)); - AsanStackFreeFunc = checkInterfaceFunction(M.getOrInsertFunction( - kAsanStackFreeName, IRB.getVoidTy(), - IntptrTy, IntptrTy, IntptrTy, NULL)); + for (int i = 0; i <= kMaxAsanStackMallocSizeClass; i++) { + std::string Suffix = itostr(i); + AsanStackMallocFunc[i] = checkInterfaceFunction( + M.getOrInsertFunction(kAsanStackMallocNameTemplate + Suffix, IntptrTy, + IntptrTy, IntptrTy, NULL)); + AsanStackFreeFunc[i] = checkInterfaceFunction(M.getOrInsertFunction( + kAsanStackFreeNameTemplate + Suffix, IRB.getVoidTy(), IntptrTy, + IntptrTy, IntptrTy, NULL)); + } AsanPoisonStackMemoryFunc = checkInterfaceFunction(M.getOrInsertFunction( kAsanPoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); AsanUnpoisonStackMemoryFunc = checkInterfaceFunction(M.getOrInsertFunction( @@ -1229,7 +1379,7 @@ void FunctionStackPoisoner::initializeCallbacks(Module &M) { } void FunctionStackPoisoner::poisonRedZones( - const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB, Value *ShadowBase, + const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> &IRB, Value *ShadowBase, bool DoPoison) { size_t ShadowRZSize = RedzoneSize() >> Mapping.Scale; assert(ShadowRZSize >= 1 && ShadowRZSize <= 4); @@ -1270,6 +1420,10 @@ void FunctionStackPoisoner::poisonRedZones( RedzoneSize(), 1ULL << Mapping.Scale, kAsanStackPartialRedzoneMagic); + Poison = + ASan.TD->isLittleEndian() + ? 
support::endian::byte_swap<uint32_t, support::little>(Poison) + : support::endian::byte_swap<uint32_t, support::big>(Poison); } Value *PartialPoison = ConstantInt::get(RZTy, Poison); IRB.CreateStore(PartialPoison, IRB.CreateIntToPtr(Ptr, RZPtrTy)); @@ -1286,12 +1440,40 @@ void FunctionStackPoisoner::poisonRedZones( } } +// Fake stack allocator (asan_fake_stack.h) has 11 size classes +// for every power of 2 from kMinStackMallocSize to kMaxAsanStackMallocSizeClass +static int StackMallocSizeClass(uint64_t LocalStackSize) { + assert(LocalStackSize <= kMaxStackMallocSize); + uint64_t MaxSize = kMinStackMallocSize; + for (int i = 0; ; i++, MaxSize *= 2) + if (LocalStackSize <= MaxSize) + return i; + llvm_unreachable("impossible LocalStackSize"); +} + +// Set Size bytes starting from ShadowBase to kAsanStackAfterReturnMagic. +// We can not use MemSet intrinsic because it may end up calling the actual +// memset. Size is a multiple of 8. +// Currently this generates 8-byte stores on x86_64; it may be better to +// generate wider stores. +void FunctionStackPoisoner::SetShadowToStackAfterReturnInlined( + IRBuilder<> &IRB, Value *ShadowBase, int Size) { + assert(!(Size % 8)); + assert(kAsanStackAfterReturnMagic == 0xf5); + for (int i = 0; i < Size; i += 8) { + Value *p = IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i)); + IRB.CreateStore(ConstantInt::get(IRB.getInt64Ty(), 0xf5f5f5f5f5f5f5f5ULL), + IRB.CreateIntToPtr(p, IRB.getInt64Ty()->getPointerTo())); + } +} + void FunctionStackPoisoner::poisonStack() { uint64_t LocalStackSize = TotalStackSize + (AllocaVec.size() + 1) * RedzoneSize(); bool DoStackMalloc = ASan.CheckUseAfterReturn && LocalStackSize <= kMaxStackMallocSize; + int StackMallocIdx = -1; assert(AllocaVec.size() > 0); Instruction *InsBefore = AllocaVec[0]; @@ -1309,8 +1491,28 @@ void FunctionStackPoisoner::poisonStack() { Value *LocalStackBase = OrigStackBase; if (DoStackMalloc) { - LocalStackBase = IRB.CreateCall2(AsanStackMallocFunc, + // LocalStackBase = OrigStackBase + // if (__asan_option_detect_stack_use_after_return) + // LocalStackBase = __asan_stack_malloc_N(LocalStackBase, OrigStackBase); + StackMallocIdx = StackMallocSizeClass(LocalStackSize); + assert(StackMallocIdx <= kMaxAsanStackMallocSizeClass); + Constant *OptionDetectUAR = F.getParent()->getOrInsertGlobal( + kAsanOptionDetectUAR, IRB.getInt32Ty()); + Value *Cmp = IRB.CreateICmpNE(IRB.CreateLoad(OptionDetectUAR), + Constant::getNullValue(IRB.getInt32Ty())); + Instruction *Term = + SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false); + BasicBlock *CmpBlock = cast<Instruction>(Cmp)->getParent(); + IRBuilder<> IRBIf(Term); + LocalStackBase = IRBIf.CreateCall2( + AsanStackMallocFunc[StackMallocIdx], ConstantInt::get(IntptrTy, LocalStackSize), OrigStackBase); + BasicBlock *SetBlock = cast<Instruction>(LocalStackBase)->getParent(); + IRB.SetInsertPoint(InsBefore); + PHINode *Phi = IRB.CreatePHI(IntptrTy, 2); + Phi->addIncoming(OrigStackBase, CmpBlock); + Phi->addIncoming(LocalStackBase, SetBlock); + LocalStackBase = Phi; } // This string will be parsed by the run-time (DescribeAddressIfStack). 
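StackMallocSizeClass above picks among the fake stack's 11 size classes: class i holds frames of up to kMinStackMallocSize << i bytes (64 B through 64 KiB). A standalone copy with the boundary cases spelled out:

    #include <cassert>
    #include <cstdint>

    // Smallest class i whose capacity (64 << i) fits LocalStackSize.
    int stackMallocSizeClass(uint64_t LocalStackSize) {
      uint64_t MaxSize = 64;                    // kMinStackMallocSize
      for (int i = 0;; i++, MaxSize *= 2)
        if (LocalStackSize <= MaxSize)
          return i;
    }

    int main() {
      assert(stackMallocSizeClass(64) == 0);
      assert(stackMallocSizeClass(65) == 1);
      assert(stackMallocSizeClass(1 << 16) == 10);  // kMaxStackMallocSize
    }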
@@ -1322,11 +1524,10 @@ void FunctionStackPoisoner::poisonStack() { bool HavePoisonedAllocas = false; for (size_t i = 0, n = AllocaPoisonCallVec.size(); i < n; i++) { const AllocaPoisonCall &APC = AllocaPoisonCallVec[i]; - IntrinsicInst *II = APC.InsBefore; - AllocaInst *AI = findAllocaForValue(II->getArgOperand(1)); - assert(AI); - IRBuilder<> IRB(II); - poisonAlloca(AI, APC.Size, IRB, APC.DoPoison); + assert(APC.InsBefore); + assert(APC.AI); + IRBuilder<> IRB(APC.InsBefore); + poisonAlloca(APC.AI, APC.Size, IRB, APC.DoPoison); HavePoisonedAllocas |= APC.DoPoison; } @@ -1384,10 +1585,35 @@ void FunctionStackPoisoner::poisonStack() { // Unpoison the stack. poisonRedZones(AllocaVec, IRBRet, ShadowBase, false); if (DoStackMalloc) { + assert(StackMallocIdx >= 0); // In use-after-return mode, mark the whole stack frame unaddressable. - IRBRet.CreateCall3(AsanStackFreeFunc, LocalStackBase, - ConstantInt::get(IntptrTy, LocalStackSize), - OrigStackBase); + if (StackMallocIdx <= 4) { + // For small sizes inline the whole thing: + // if LocalStackBase != OrigStackBase: + // memset(ShadowBase, kAsanStackAfterReturnMagic, ShadowSize); + // **SavedFlagPtr(LocalStackBase) = 0 + // FIXME: if LocalStackBase != OrigStackBase don't call poisonRedZones. + Value *Cmp = IRBRet.CreateICmpNE(LocalStackBase, OrigStackBase); + TerminatorInst *PoisonTerm = + SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false); + IRBuilder<> IRBPoison(PoisonTerm); + int ClassSize = kMinStackMallocSize << StackMallocIdx; + SetShadowToStackAfterReturnInlined(IRBPoison, ShadowBase, + ClassSize >> Mapping.Scale); + Value *SavedFlagPtrPtr = IRBPoison.CreateAdd( + LocalStackBase, + ConstantInt::get(IntptrTy, ClassSize - ASan.LongSize / 8)); + Value *SavedFlagPtr = IRBPoison.CreateLoad( + IRBPoison.CreateIntToPtr(SavedFlagPtrPtr, IntptrPtrTy)); + IRBPoison.CreateStore( + Constant::getNullValue(IRBPoison.getInt8Ty()), + IRBPoison.CreateIntToPtr(SavedFlagPtr, IRBPoison.getInt8PtrTy())); + } else { + // For larger frames call __asan_stack_free_*. + IRBRet.CreateCall3(AsanStackFreeFunc[StackMallocIdx], LocalStackBase, + ConstantInt::get(IntptrTy, LocalStackSize), + OrigStackBase); + } } else if (HavePoisonedAllocas) { // If we poisoned some allocas in llvm.lifetime analysis, // unpoison whole stack frame now. @@ -1402,7 +1628,7 @@ void FunctionStackPoisoner::poisonStack() { } void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size, - IRBuilder<> IRB, bool DoPoison) { + IRBuilder<> &IRB, bool DoPoison) { // For now just insert the call to ASan runtime. Value *AddrArg = IRB.CreatePointerCast(V, IntptrTy); Value *SizeArg = ConstantInt::get(IntptrTy, Size); diff --git a/contrib/llvm/lib/Transforms/Instrumentation/BlackList.cpp b/contrib/llvm/lib/Transforms/Instrumentation/BlackList.cpp deleted file mode 100644 index 39de4b0..0000000 --- a/contrib/llvm/lib/Transforms/Instrumentation/BlackList.cpp +++ /dev/null @@ -1,126 +0,0 @@ -//===-- BlackList.cpp - blacklist for sanitizers --------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This is a utility class for instrumentation passes (like AddressSanitizer -// or ThreadSanitizer) to avoid instrumenting some functions or global -// variables based on a user-supplied blacklist. 
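The BlackList implementation deleted below documents its own input format: one entry per line as "section:pattern", '#' starts a comment, and '*' is rewritten to the regex ".*". A file of the shape the parser accepted, using the section names queried in this file (all entries are illustrative):

    # suppress instrumentation for matching entities
    src:third_party/*
    fun:*LegacyCodec*
    global:g_scratch_buffer
    global-init:g_registry
    global-init-type:struct.ThreadPool

SpecialCaseList, its replacement throughout this commit, consumes the same kind of sectioned pattern list.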
-// -//===----------------------------------------------------------------------===// - -#include "llvm/Transforms/Utils/BlackList.h" -#include "llvm/ADT/OwningPtr.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/Regex.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/system_error.h" -#include <string> -#include <utility> - -namespace llvm { - -BlackList::BlackList(const StringRef Path) { - // Validate and open blacklist file. - if (Path.empty()) return; - OwningPtr<MemoryBuffer> File; - if (error_code EC = MemoryBuffer::getFile(Path, File)) { - report_fatal_error("Can't open blacklist file: " + Path + ": " + - EC.message()); - } - - // Iterate through each line in the blacklist file. - SmallVector<StringRef, 16> Lines; - SplitString(File.take()->getBuffer(), Lines, "\n\r"); - StringMap<std::string> Regexps; - for (SmallVector<StringRef, 16>::iterator I = Lines.begin(), E = Lines.end(); - I != E; ++I) { - // Ignore empty lines and lines starting with "#" - if (I->empty() || I->startswith("#")) - continue; - // Get our prefix and unparsed regexp. - std::pair<StringRef, StringRef> SplitLine = I->split(":"); - StringRef Prefix = SplitLine.first; - std::string Regexp = SplitLine.second; - if (Regexp.empty()) { - // Missing ':' in the line. - report_fatal_error("malformed blacklist line: " + SplitLine.first); - } - - // Replace * with .* - for (size_t pos = 0; (pos = Regexp.find("*", pos)) != std::string::npos; - pos += strlen(".*")) { - Regexp.replace(pos, strlen("*"), ".*"); - } - - // Check that the regexp is valid. - Regex CheckRE(Regexp); - std::string Error; - if (!CheckRE.isValid(Error)) { - report_fatal_error("malformed blacklist regex: " + SplitLine.second + - ": " + Error); - } - - // Add this regexp into the proper group by its prefix. - if (!Regexps[Prefix].empty()) - Regexps[Prefix] += "|"; - Regexps[Prefix] += Regexp; - } - - // Iterate through each of the prefixes, and create Regexs for them. - for (StringMap<std::string>::const_iterator I = Regexps.begin(), - E = Regexps.end(); I != E; ++I) { - Entries[I->getKey()] = new Regex(I->getValue()); - } -} - -bool BlackList::isIn(const Function &F) const { - return isIn(*F.getParent()) || inSection("fun", F.getName()); -} - -bool BlackList::isIn(const GlobalVariable &G) const { - return isIn(*G.getParent()) || inSection("global", G.getName()); -} - -bool BlackList::isIn(const Module &M) const { - return inSection("src", M.getModuleIdentifier()); -} - -static StringRef GetGVTypeString(const GlobalVariable &G) { - // Types of GlobalVariables are always pointer types. - Type *GType = G.getType()->getElementType(); - // For now we support blacklisting struct types only. 
- if (StructType *SGType = dyn_cast<StructType>(GType)) { - if (!SGType->isLiteral()) - return SGType->getName(); - } - return "<unknown type>"; -} - -bool BlackList::isInInit(const GlobalVariable &G) const { - return (isIn(*G.getParent()) || - inSection("global-init", G.getName()) || - inSection("global-init-type", GetGVTypeString(G)) || - inSection("global-init-src", G.getParent()->getModuleIdentifier())); -} - -bool BlackList::inSection(const StringRef Section, - const StringRef Query) const { - StringMap<Regex*>::const_iterator I = Entries.find(Section); - if (I == Entries.end()) return false; - - Regex *FunctionRegex = I->getValue(); - return FunctionRegex->match(Query); -} - -} // namespace llvm diff --git a/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp index b094d42..7a9f0f6 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -80,7 +80,7 @@ BasicBlock *BoundsChecking::getTrapBB() { return TrapBB; Function *Fn = Inst->getParent()->getParent(); - BasicBlock::iterator PrevInsertPoint = Builder->GetInsertPoint(); + IRBuilder<>::InsertPointGuard Guard(*Builder); TrapBB = BasicBlock::Create(Fn->getContext(), "trap", Fn); Builder->SetInsertPoint(TrapBB); @@ -91,7 +91,6 @@ BasicBlock *BoundsChecking::getTrapBB() { TrapCall->setDebugLoc(Inst->getDebugLoc()); Builder->CreateUnreachable(); - Builder->SetInsertPoint(PrevInsertPoint); return TrapBB; } @@ -173,7 +172,8 @@ bool BoundsChecking::runOnFunction(Function &F) { TrapBB = 0; BuilderTy TheBuilder(F.getContext(), TargetFolder(TD)); Builder = &TheBuilder; - ObjectSizeOffsetEvaluator TheObjSizeEval(TD, TLI, F.getContext()); + ObjectSizeOffsetEvaluator TheObjSizeEval(TD, TLI, F.getContext(), + /*RoundToAlign=*/true); ObjSizeEval = &TheObjSizeEval; // check HANDLE_MEMORY_INST in include/llvm/Instruction.def for memory diff --git a/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp new file mode 100644 index 0000000..9b9e725 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -0,0 +1,1397 @@ +//===-- DataFlowSanitizer.cpp - dynamic data flow analysis ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file is a part of DataFlowSanitizer, a generalised dynamic data flow +/// analysis. +/// +/// Unlike other Sanitizer tools, this tool is not designed to detect a specific +/// class of bugs on its own. Instead, it provides a generic dynamic data flow +/// analysis framework to be used by clients to help detect application-specific +/// issues within their own code. +/// +/// The analysis is based on automatic propagation of data flow labels (also +/// known as taint labels) through a program as it performs computation. Each +/// byte of application memory is backed by two bytes of shadow memory which +/// hold the label. 
On Linux/x86_64, memory is laid out as follows: +/// +/// +--------------------+ 0x800000000000 (top of memory) +/// | application memory | +/// +--------------------+ 0x700000008000 (kAppAddr) +/// | | +/// | unused | +/// | | +/// +--------------------+ 0x200200000000 (kUnusedAddr) +/// | union table | +/// +--------------------+ 0x200000000000 (kUnionTableAddr) +/// | shadow memory | +/// +--------------------+ 0x000000010000 (kShadowAddr) +/// | reserved by kernel | +/// +--------------------+ 0x000000000000 +/// +/// To derive a shadow memory address from an application memory address, +/// bits 44-46 are cleared to bring the address into the range +/// [0x000000008000,0x100000000000). Then the address is shifted left by 1 to +/// account for the double byte representation of shadow labels and move the +/// address into the shadow memory range. See the function +/// DataFlowSanitizer::getShadowAddress below. +/// +/// For more information, please refer to the design document: +/// http://clang.llvm.org/docs/DataFlowSanitizerDesign.html + +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/InstVisitor.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/SpecialCaseList.h" +#include <iterator> + +using namespace llvm; + +// The -dfsan-preserve-alignment flag controls whether this pass assumes that +// alignment requirements provided by the input IR are correct. For example, +// if the input IR contains a load with alignment 8, this flag will cause +// the shadow load to have alignment 16. This flag is disabled by default as +// we have unfortunately encountered too much code (including Clang itself; +// see PR14291) which performs misaligned access. +static cl::opt<bool> ClPreserveAlignment( + "dfsan-preserve-alignment", + cl::desc("respect alignment requirements provided by input IR"), cl::Hidden, + cl::init(false)); + +// The ABI list file controls how shadow parameters are passed. The pass treats +// every function labelled "uninstrumented" in the ABI list file as conforming +// to the "native" (i.e. unsanitized) ABI. Unless the ABI list contains +// additional annotations for those functions, a call to one of those functions +// will produce a warning message, as the labelling behaviour of the function is +// unknown. The other supported annotations are "functional" and "discard", +// which are described below under DataFlowSanitizer::WrapperKind. +static cl::opt<std::string> ClABIListFile( + "dfsan-abilist", + cl::desc("File listing native ABI functions and how the pass treats them"), + cl::Hidden); + +// Controls whether the pass uses IA_Args or IA_TLS as the ABI for instrumented +// functions (see DataFlowSanitizer::InstrumentedABI below). 
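The address derivation described in the header comment above (clear bits 44-46, then shift left by 1 for the two-byte labels) can be written down directly; a sketch using the Linux/x86_64 constants from the diagram, mirroring what getShadowAddress computes with the ShadowPtrMask and ShadowPtrMul fields initialized later in this file:

    #include <cstdint>

    // Application address -> DFSan shadow address (Linux/x86_64 layout).
    uint64_t shadowForApp(uint64_t Addr) {
      return (Addr & ~0x700000000000ULL) << 1;  // clear bits 44-46, then x2
    }
    // e.g. shadowForApp(0x700000008000) == 0x10000, i.e. kShadowAddr.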
+static cl::opt<bool> ClArgsABI( + "dfsan-args-abi", + cl::desc("Use the argument ABI rather than the TLS ABI"), + cl::Hidden); + +static cl::opt<bool> ClDebugNonzeroLabels( + "dfsan-debug-nonzero-labels", + cl::desc("Insert calls to __dfsan_nonzero_label on observing a parameter, " + "load or return with a nonzero label"), + cl::Hidden); + +namespace { + +class DataFlowSanitizer : public ModulePass { + friend struct DFSanFunction; + friend class DFSanVisitor; + + enum { + ShadowWidth = 16 + }; + + /// Which ABI should be used for instrumented functions? + enum InstrumentedABI { + /// Argument and return value labels are passed through additional + /// arguments and by modifying the return type. + IA_Args, + + /// Argument and return value labels are passed through TLS variables + /// __dfsan_arg_tls and __dfsan_retval_tls. + IA_TLS + }; + + /// How should calls to uninstrumented functions be handled? + enum WrapperKind { + /// This function is present in an uninstrumented form but we don't know + /// how it should be handled. Print a warning and call the function anyway. + /// Don't label the return value. + WK_Warning, + + /// This function does not write to (user-accessible) memory, and its return + /// value is unlabelled. + WK_Discard, + + /// This function does not write to (user-accessible) memory, and the label + /// of its return value is the union of the label of its arguments. + WK_Functional, + + /// Instead of calling the function, a custom wrapper __dfsw_F is called, + /// where F is the name of the function. This function may wrap the + /// original function or provide its own implementation. This is similar to + /// the IA_Args ABI, except that IA_Args uses a struct return type to + /// pass the return value shadow in a register, while WK_Custom uses an + /// extra pointer argument to return the shadow. This allows the wrapped + /// form of the function type to be expressed in C. 
+ WK_Custom + }; + + DataLayout *DL; + Module *Mod; + LLVMContext *Ctx; + IntegerType *ShadowTy; + PointerType *ShadowPtrTy; + IntegerType *IntptrTy; + ConstantInt *ZeroShadow; + ConstantInt *ShadowPtrMask; + ConstantInt *ShadowPtrMul; + Constant *ArgTLS; + Constant *RetvalTLS; + void *(*GetArgTLSPtr)(); + void *(*GetRetvalTLSPtr)(); + Constant *GetArgTLS; + Constant *GetRetvalTLS; + FunctionType *DFSanUnionFnTy; + FunctionType *DFSanUnionLoadFnTy; + FunctionType *DFSanUnimplementedFnTy; + FunctionType *DFSanSetLabelFnTy; + FunctionType *DFSanNonzeroLabelFnTy; + Constant *DFSanUnionFn; + Constant *DFSanUnionLoadFn; + Constant *DFSanUnimplementedFn; + Constant *DFSanSetLabelFn; + Constant *DFSanNonzeroLabelFn; + MDNode *ColdCallWeights; + OwningPtr<SpecialCaseList> ABIList; + DenseMap<Value *, Function *> UnwrappedFnMap; + AttributeSet ReadOnlyNoneAttrs; + + Value *getShadowAddress(Value *Addr, Instruction *Pos); + Value *combineShadows(Value *V1, Value *V2, Instruction *Pos); + bool isInstrumented(const Function *F); + bool isInstrumented(const GlobalAlias *GA); + FunctionType *getArgsFunctionType(FunctionType *T); + FunctionType *getTrampolineFunctionType(FunctionType *T); + FunctionType *getCustomFunctionType(FunctionType *T); + InstrumentedABI getInstrumentedABI(); + WrapperKind getWrapperKind(Function *F); + void addGlobalNamePrefix(GlobalValue *GV); + Function *buildWrapperFunction(Function *F, StringRef NewFName, + GlobalValue::LinkageTypes NewFLink, + FunctionType *NewFT); + Constant *getOrBuildTrampolineFunction(FunctionType *FT, StringRef FName); + + public: + DataFlowSanitizer(StringRef ABIListFile = StringRef(), + void *(*getArgTLS)() = 0, void *(*getRetValTLS)() = 0); + static char ID; + bool doInitialization(Module &M); + bool runOnModule(Module &M); +}; + +struct DFSanFunction { + DataFlowSanitizer &DFS; + Function *F; + DataFlowSanitizer::InstrumentedABI IA; + bool IsNativeABI; + Value *ArgTLSPtr; + Value *RetvalTLSPtr; + AllocaInst *LabelReturnAlloca; + DenseMap<Value *, Value *> ValShadowMap; + DenseMap<AllocaInst *, AllocaInst *> AllocaShadowMap; + std::vector<std::pair<PHINode *, PHINode *> > PHIFixups; + DenseSet<Instruction *> SkipInsts; + DenseSet<Value *> NonZeroChecks; + + DFSanFunction(DataFlowSanitizer &DFS, Function *F, bool IsNativeABI) + : DFS(DFS), F(F), IA(DFS.getInstrumentedABI()), + IsNativeABI(IsNativeABI), ArgTLSPtr(0), RetvalTLSPtr(0), + LabelReturnAlloca(0) {} + Value *getArgTLSPtr(); + Value *getArgTLS(unsigned Index, Instruction *Pos); + Value *getRetvalTLS(); + Value *getShadow(Value *V); + void setShadow(Instruction *I, Value *Shadow); + Value *combineOperandShadows(Instruction *Inst); + Value *loadShadow(Value *ShadowAddr, uint64_t Size, uint64_t Align, + Instruction *Pos); + void storeShadow(Value *Addr, uint64_t Size, uint64_t Align, Value *Shadow, + Instruction *Pos); +}; + +class DFSanVisitor : public InstVisitor<DFSanVisitor> { + public: + DFSanFunction &DFSF; + DFSanVisitor(DFSanFunction &DFSF) : DFSF(DFSF) {} + + void visitOperandShadowInst(Instruction &I); + + void visitBinaryOperator(BinaryOperator &BO); + void visitCastInst(CastInst &CI); + void visitCmpInst(CmpInst &CI); + void visitGetElementPtrInst(GetElementPtrInst &GEPI); + void visitLoadInst(LoadInst &LI); + void visitStoreInst(StoreInst &SI); + void visitReturnInst(ReturnInst &RI); + void visitCallSite(CallSite CS); + void visitPHINode(PHINode &PN); + void visitExtractElementInst(ExtractElementInst &I); + void visitInsertElementInst(InsertElementInst &I); + void 
visitShuffleVectorInst(ShuffleVectorInst &I); + void visitExtractValueInst(ExtractValueInst &I); + void visitInsertValueInst(InsertValueInst &I); + void visitAllocaInst(AllocaInst &I); + void visitSelectInst(SelectInst &I); + void visitMemSetInst(MemSetInst &I); + void visitMemTransferInst(MemTransferInst &I); +}; + +} + +char DataFlowSanitizer::ID; +INITIALIZE_PASS(DataFlowSanitizer, "dfsan", + "DataFlowSanitizer: dynamic data flow analysis.", false, false) + +ModulePass *llvm::createDataFlowSanitizerPass(StringRef ABIListFile, + void *(*getArgTLS)(), + void *(*getRetValTLS)()) { + return new DataFlowSanitizer(ABIListFile, getArgTLS, getRetValTLS); +} + +DataFlowSanitizer::DataFlowSanitizer(StringRef ABIListFile, + void *(*getArgTLS)(), + void *(*getRetValTLS)()) + : ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS), + ABIList(SpecialCaseList::createOrDie(ABIListFile.empty() ? ClABIListFile + : ABIListFile)) { +} + +FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) { + llvm::SmallVector<Type *, 4> ArgTypes; + std::copy(T->param_begin(), T->param_end(), std::back_inserter(ArgTypes)); + for (unsigned i = 0, e = T->getNumParams(); i != e; ++i) + ArgTypes.push_back(ShadowTy); + if (T->isVarArg()) + ArgTypes.push_back(ShadowPtrTy); + Type *RetType = T->getReturnType(); + if (!RetType->isVoidTy()) + RetType = StructType::get(RetType, ShadowTy, (Type *)0); + return FunctionType::get(RetType, ArgTypes, T->isVarArg()); +} + +FunctionType *DataFlowSanitizer::getTrampolineFunctionType(FunctionType *T) { + assert(!T->isVarArg()); + llvm::SmallVector<Type *, 4> ArgTypes; + ArgTypes.push_back(T->getPointerTo()); + std::copy(T->param_begin(), T->param_end(), std::back_inserter(ArgTypes)); + for (unsigned i = 0, e = T->getNumParams(); i != e; ++i) + ArgTypes.push_back(ShadowTy); + Type *RetType = T->getReturnType(); + if (!RetType->isVoidTy()) + ArgTypes.push_back(ShadowPtrTy); + return FunctionType::get(T->getReturnType(), ArgTypes, false); +} + +FunctionType *DataFlowSanitizer::getCustomFunctionType(FunctionType *T) { + assert(!T->isVarArg()); + llvm::SmallVector<Type *, 4> ArgTypes; + for (FunctionType::param_iterator i = T->param_begin(), e = T->param_end(); + i != e; ++i) { + FunctionType *FT; + if (isa<PointerType>(*i) && (FT = dyn_cast<FunctionType>(cast<PointerType>( + *i)->getElementType()))) { + ArgTypes.push_back(getTrampolineFunctionType(FT)->getPointerTo()); + ArgTypes.push_back(Type::getInt8PtrTy(*Ctx)); + } else { + ArgTypes.push_back(*i); + } + } + for (unsigned i = 0, e = T->getNumParams(); i != e; ++i) + ArgTypes.push_back(ShadowTy); + Type *RetType = T->getReturnType(); + if (!RetType->isVoidTy()) + ArgTypes.push_back(ShadowPtrTy); + return FunctionType::get(T->getReturnType(), ArgTypes, false); +} + +bool DataFlowSanitizer::doInitialization(Module &M) { + DL = getAnalysisIfAvailable<DataLayout>(); + if (!DL) + return false; + + Mod = &M; + Ctx = &M.getContext(); + ShadowTy = IntegerType::get(*Ctx, ShadowWidth); + ShadowPtrTy = PointerType::getUnqual(ShadowTy); + IntptrTy = DL->getIntPtrType(*Ctx); + ZeroShadow = ConstantInt::getSigned(ShadowTy, 0); + ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL); + ShadowPtrMul = ConstantInt::getSigned(IntptrTy, ShadowWidth / 8); + + Type *DFSanUnionArgs[2] = { ShadowTy, ShadowTy }; + DFSanUnionFnTy = + FunctionType::get(ShadowTy, DFSanUnionArgs, /*isVarArg=*/ false); + Type *DFSanUnionLoadArgs[2] = { ShadowPtrTy, IntptrTy }; + DFSanUnionLoadFnTy = + FunctionType::get(ShadowTy, 
DFSanUnionLoadArgs, /*isVarArg=*/ false); + DFSanUnimplementedFnTy = FunctionType::get( + Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false); + Type *DFSanSetLabelArgs[3] = { ShadowTy, Type::getInt8PtrTy(*Ctx), IntptrTy }; + DFSanSetLabelFnTy = FunctionType::get(Type::getVoidTy(*Ctx), + DFSanSetLabelArgs, /*isVarArg=*/false); + DFSanNonzeroLabelFnTy = FunctionType::get( + Type::getVoidTy(*Ctx), ArrayRef<Type *>(), /*isVarArg=*/false); + + if (GetArgTLSPtr) { + Type *ArgTLSTy = ArrayType::get(ShadowTy, 64); + ArgTLS = 0; + GetArgTLS = ConstantExpr::getIntToPtr( + ConstantInt::get(IntptrTy, uintptr_t(GetArgTLSPtr)), + PointerType::getUnqual( + FunctionType::get(PointerType::getUnqual(ArgTLSTy), (Type *)0))); + } + if (GetRetvalTLSPtr) { + RetvalTLS = 0; + GetRetvalTLS = ConstantExpr::getIntToPtr( + ConstantInt::get(IntptrTy, uintptr_t(GetRetvalTLSPtr)), + PointerType::getUnqual( + FunctionType::get(PointerType::getUnqual(ShadowTy), (Type *)0))); + } + + ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000); + return true; +} + +bool DataFlowSanitizer::isInstrumented(const Function *F) { + return !ABIList->isIn(*F, "uninstrumented"); +} + +bool DataFlowSanitizer::isInstrumented(const GlobalAlias *GA) { + return !ABIList->isIn(*GA, "uninstrumented"); +} + +DataFlowSanitizer::InstrumentedABI DataFlowSanitizer::getInstrumentedABI() { + return ClArgsABI ? IA_Args : IA_TLS; +} + +DataFlowSanitizer::WrapperKind DataFlowSanitizer::getWrapperKind(Function *F) { + if (ABIList->isIn(*F, "functional")) + return WK_Functional; + if (ABIList->isIn(*F, "discard")) + return WK_Discard; + if (ABIList->isIn(*F, "custom")) + return WK_Custom; + + return WK_Warning; +} + +void DataFlowSanitizer::addGlobalNamePrefix(GlobalValue *GV) { + std::string GVName = GV->getName(), Prefix = "dfs$"; + GV->setName(Prefix + GVName); + + // Try to change the name of the function in module inline asm. We only do + // this for specific asm directives, currently only ".symver", to try to avoid + // corrupting asm which happens to contain the symbol name as a substring. + // Note that the substitution for .symver assumes that the versioned symbol + // also has an instrumented name. 
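+  // For example (illustrative), if GVName is "f" and the module asm contains
+  //   .symver f,f@@VERS_1.0
+  // the directive is rewritten to
+  //   .symver dfs$f,dfs$f@@VERS_1.0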
+ std::string Asm = GV->getParent()->getModuleInlineAsm(); + std::string SearchStr = ".symver " + GVName + ","; + size_t Pos = Asm.find(SearchStr); + if (Pos != std::string::npos) { + Asm.replace(Pos, SearchStr.size(), + ".symver " + Prefix + GVName + "," + Prefix); + GV->getParent()->setModuleInlineAsm(Asm); + } +} + +Function * +DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName, + GlobalValue::LinkageTypes NewFLink, + FunctionType *NewFT) { + FunctionType *FT = F->getFunctionType(); + Function *NewF = Function::Create(NewFT, NewFLink, NewFName, + F->getParent()); + NewF->copyAttributesFrom(F); + NewF->removeAttributes( + AttributeSet::ReturnIndex, + AttributeFuncs::typeIncompatible(NewFT->getReturnType(), + AttributeSet::ReturnIndex)); + + BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF); + std::vector<Value *> Args; + unsigned n = FT->getNumParams(); + for (Function::arg_iterator ai = NewF->arg_begin(); n != 0; ++ai, --n) + Args.push_back(&*ai); + CallInst *CI = CallInst::Create(F, Args, "", BB); + if (FT->getReturnType()->isVoidTy()) + ReturnInst::Create(*Ctx, BB); + else + ReturnInst::Create(*Ctx, CI, BB); + + return NewF; +} + +Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT, + StringRef FName) { + FunctionType *FTT = getTrampolineFunctionType(FT); + Constant *C = Mod->getOrInsertFunction(FName, FTT); + Function *F = dyn_cast<Function>(C); + if (F && F->isDeclaration()) { + F->setLinkage(GlobalValue::LinkOnceODRLinkage); + BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", F); + std::vector<Value *> Args; + Function::arg_iterator AI = F->arg_begin(); ++AI; + for (unsigned N = FT->getNumParams(); N != 0; ++AI, --N) + Args.push_back(&*AI); + CallInst *CI = + CallInst::Create(&F->getArgumentList().front(), Args, "", BB); + ReturnInst *RI; + if (FT->getReturnType()->isVoidTy()) + RI = ReturnInst::Create(*Ctx, BB); + else + RI = ReturnInst::Create(*Ctx, CI, BB); + + DFSanFunction DFSF(*this, F, /*IsNativeABI=*/true); + Function::arg_iterator ValAI = F->arg_begin(), ShadowAI = AI; ++ValAI; + for (unsigned N = FT->getNumParams(); N != 0; ++ValAI, ++ShadowAI, --N) + DFSF.ValShadowMap[ValAI] = ShadowAI; + DFSanVisitor(DFSF).visitCallInst(*CI); + if (!FT->getReturnType()->isVoidTy()) + new StoreInst(DFSF.getShadow(RI->getReturnValue()), + &F->getArgumentList().back(), RI); + } + + return C; +} + +bool DataFlowSanitizer::runOnModule(Module &M) { + if (!DL) + return false; + + if (ABIList->isIn(M, "skip")) + return false; + + if (!GetArgTLSPtr) { + Type *ArgTLSTy = ArrayType::get(ShadowTy, 64); + ArgTLS = Mod->getOrInsertGlobal("__dfsan_arg_tls", ArgTLSTy); + if (GlobalVariable *G = dyn_cast<GlobalVariable>(ArgTLS)) + G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel); + } + if (!GetRetvalTLSPtr) { + RetvalTLS = Mod->getOrInsertGlobal("__dfsan_retval_tls", ShadowTy); + if (GlobalVariable *G = dyn_cast<GlobalVariable>(RetvalTLS)) + G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel); + } + + DFSanUnionFn = Mod->getOrInsertFunction("__dfsan_union", DFSanUnionFnTy); + if (Function *F = dyn_cast<Function>(DFSanUnionFn)) { + F->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone); + F->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt); + F->addAttribute(1, Attribute::ZExt); + F->addAttribute(2, Attribute::ZExt); + } + DFSanUnionLoadFn = + Mod->getOrInsertFunction("__dfsan_union_load", DFSanUnionLoadFnTy); + if (Function *F = dyn_cast<Function>(DFSanUnionLoadFn)) { + F->addAttribute(AttributeSet::ReturnIndex, 
Attribute::ZExt); + } + DFSanUnimplementedFn = + Mod->getOrInsertFunction("__dfsan_unimplemented", DFSanUnimplementedFnTy); + DFSanSetLabelFn = + Mod->getOrInsertFunction("__dfsan_set_label", DFSanSetLabelFnTy); + if (Function *F = dyn_cast<Function>(DFSanSetLabelFn)) { + F->addAttribute(1, Attribute::ZExt); + } + DFSanNonzeroLabelFn = + Mod->getOrInsertFunction("__dfsan_nonzero_label", DFSanNonzeroLabelFnTy); + + std::vector<Function *> FnsToInstrument; + llvm::SmallPtrSet<Function *, 2> FnsWithNativeABI; + for (Module::iterator i = M.begin(), e = M.end(); i != e; ++i) { + if (!i->isIntrinsic() && + i != DFSanUnionFn && + i != DFSanUnionLoadFn && + i != DFSanUnimplementedFn && + i != DFSanSetLabelFn && + i != DFSanNonzeroLabelFn) + FnsToInstrument.push_back(&*i); + } + + // Give function aliases prefixes when necessary, and build wrappers where the + // instrumentedness is inconsistent. + for (Module::alias_iterator i = M.alias_begin(), e = M.alias_end(); i != e;) { + GlobalAlias *GA = &*i; + ++i; + // Don't stop on weak. We assume people aren't playing games with the + // instrumentedness of overridden weak aliases. + if (Function *F = dyn_cast<Function>( + GA->resolveAliasedGlobal(/*stopOnWeak=*/false))) { + bool GAInst = isInstrumented(GA), FInst = isInstrumented(F); + if (GAInst && FInst) { + addGlobalNamePrefix(GA); + } else if (GAInst != FInst) { + // Non-instrumented alias of an instrumented function, or vice versa. + // Replace the alias with a native-ABI wrapper of the aliasee. The pass + // below will take care of instrumenting it. + Function *NewF = + buildWrapperFunction(F, "", GA->getLinkage(), F->getFunctionType()); + GA->replaceAllUsesWith(NewF); + NewF->takeName(GA); + GA->eraseFromParent(); + FnsToInstrument.push_back(NewF); + } + } + } + + AttrBuilder B; + B.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone); + ReadOnlyNoneAttrs = AttributeSet::get(*Ctx, AttributeSet::FunctionIndex, B); + + // First, change the ABI of every function in the module. ABI-listed + // functions keep their original ABI and get a wrapper function. + for (std::vector<Function *>::iterator i = FnsToInstrument.begin(), + e = FnsToInstrument.end(); + i != e; ++i) { + Function &F = **i; + FunctionType *FT = F.getFunctionType(); + + bool IsZeroArgsVoidRet = (FT->getNumParams() == 0 && !FT->isVarArg() && + FT->getReturnType()->isVoidTy()); + + if (isInstrumented(&F)) { + // Instrumented functions get a 'dfs$' prefix. This allows us to more + // easily identify cases of mismatching ABIs. 
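+    // For example (illustrative), an instrumented definition of "f" is renamed
+    // "dfs$f", so an uninstrumented caller still referencing "f" fails to link
+    // rather than silently calling through the wrong ABI.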
+ if (getInstrumentedABI() == IA_Args && !IsZeroArgsVoidRet) { + FunctionType *NewFT = getArgsFunctionType(FT); + Function *NewF = Function::Create(NewFT, F.getLinkage(), "", &M); + NewF->copyAttributesFrom(&F); + NewF->removeAttributes( + AttributeSet::ReturnIndex, + AttributeFuncs::typeIncompatible(NewFT->getReturnType(), + AttributeSet::ReturnIndex)); + for (Function::arg_iterator FArg = F.arg_begin(), + NewFArg = NewF->arg_begin(), + FArgEnd = F.arg_end(); + FArg != FArgEnd; ++FArg, ++NewFArg) { + FArg->replaceAllUsesWith(NewFArg); + } + NewF->getBasicBlockList().splice(NewF->begin(), F.getBasicBlockList()); + + for (Function::use_iterator ui = F.use_begin(), ue = F.use_end(); + ui != ue;) { + BlockAddress *BA = dyn_cast<BlockAddress>(ui.getUse().getUser()); + ++ui; + if (BA) { + BA->replaceAllUsesWith( + BlockAddress::get(NewF, BA->getBasicBlock())); + delete BA; + } + } + F.replaceAllUsesWith( + ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT))); + NewF->takeName(&F); + F.eraseFromParent(); + *i = NewF; + addGlobalNamePrefix(NewF); + } else { + addGlobalNamePrefix(&F); + } + // Hopefully, nobody will try to indirectly call a vararg + // function... yet. + } else if (FT->isVarArg()) { + UnwrappedFnMap[&F] = &F; + *i = 0; + } else if (!IsZeroArgsVoidRet || getWrapperKind(&F) == WK_Custom) { + // Build a wrapper function for F. The wrapper simply calls F, and is + // added to FnsToInstrument so that any instrumentation according to its + // WrapperKind is done in the second pass below. + FunctionType *NewFT = getInstrumentedABI() == IA_Args + ? getArgsFunctionType(FT) + : FT; + Function *NewF = buildWrapperFunction( + &F, std::string("dfsw$") + std::string(F.getName()), + GlobalValue::LinkOnceODRLinkage, NewFT); + if (getInstrumentedABI() == IA_TLS) + NewF->removeAttributes(AttributeSet::FunctionIndex, ReadOnlyNoneAttrs); + + Value *WrappedFnCst = + ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT)); + F.replaceAllUsesWith(WrappedFnCst); + UnwrappedFnMap[WrappedFnCst] = &F; + *i = NewF; + + if (!F.isDeclaration()) { + // This function is probably defining an interposition of an + // uninstrumented function and hence needs to keep the original ABI. + // But any functions it may call need to use the instrumented ABI, so + // we instrument it in a mode which preserves the original ABI. + FnsWithNativeABI.insert(&F); + + // This code needs to rebuild the iterators, as they may be invalidated + // by the push_back, taking care that the new range does not include + // any functions added by this code. + size_t N = i - FnsToInstrument.begin(), + Count = e - FnsToInstrument.begin(); + FnsToInstrument.push_back(&F); + i = FnsToInstrument.begin() + N; + e = FnsToInstrument.begin() + Count; + } + } + } + + for (std::vector<Function *>::iterator i = FnsToInstrument.begin(), + e = FnsToInstrument.end(); + i != e; ++i) { + if (!*i || (*i)->isDeclaration()) + continue; + + removeUnreachableBlocks(**i); + + DFSanFunction DFSF(*this, *i, FnsWithNativeABI.count(*i)); + + // DFSanVisitor may create new basic blocks, which confuses df_iterator. + // Build a copy of the list before iterating over it. 
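+    // (removeUnreachableBlocks above has already deleted any block that the
+    // depth-first walk from the entry block would not visit.)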
+ llvm::SmallVector<BasicBlock *, 4> BBList; + std::copy(df_begin(&(*i)->getEntryBlock()), df_end(&(*i)->getEntryBlock()), + std::back_inserter(BBList)); + + for (llvm::SmallVector<BasicBlock *, 4>::iterator i = BBList.begin(), + e = BBList.end(); + i != e; ++i) { + Instruction *Inst = &(*i)->front(); + while (1) { + // DFSanVisitor may split the current basic block, changing the current + // instruction's next pointer and moving the next instruction to the + // tail block from which we should continue. + Instruction *Next = Inst->getNextNode(); + // DFSanVisitor may delete Inst, so keep track of whether it was a + // terminator. + bool IsTerminator = isa<TerminatorInst>(Inst); + if (!DFSF.SkipInsts.count(Inst)) + DFSanVisitor(DFSF).visit(Inst); + if (IsTerminator) + break; + Inst = Next; + } + } + + // We will not necessarily be able to compute the shadow for every phi node + // until we have visited every block. Therefore, the code that handles phi + // nodes adds them to the PHIFixups list so that they can be properly + // handled here. + for (std::vector<std::pair<PHINode *, PHINode *> >::iterator + i = DFSF.PHIFixups.begin(), + e = DFSF.PHIFixups.end(); + i != e; ++i) { + for (unsigned val = 0, n = i->first->getNumIncomingValues(); val != n; + ++val) { + i->second->setIncomingValue( + val, DFSF.getShadow(i->first->getIncomingValue(val))); + } + } + + // -dfsan-debug-nonzero-labels will split the CFG in all kinds of crazy + // places (i.e. instructions in basic blocks we haven't even begun visiting + // yet). To make our life easier, do this work in a pass after the main + // instrumentation. + if (ClDebugNonzeroLabels) { + for (DenseSet<Value *>::iterator i = DFSF.NonZeroChecks.begin(), + e = DFSF.NonZeroChecks.end(); + i != e; ++i) { + Instruction *Pos; + if (Instruction *I = dyn_cast<Instruction>(*i)) + Pos = I->getNextNode(); + else + Pos = DFSF.F->getEntryBlock().begin(); + while (isa<PHINode>(Pos) || isa<AllocaInst>(Pos)) + Pos = Pos->getNextNode(); + IRBuilder<> IRB(Pos); + Instruction *NeInst = cast<Instruction>( + IRB.CreateICmpNE(*i, DFSF.DFS.ZeroShadow)); + BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen( + NeInst, /*Unreachable=*/ false, ColdCallWeights)); + IRBuilder<> ThenIRB(BI); + ThenIRB.CreateCall(DFSF.DFS.DFSanNonzeroLabelFn); + } + } + } + + return false; +} + +Value *DFSanFunction::getArgTLSPtr() { + if (ArgTLSPtr) + return ArgTLSPtr; + if (DFS.ArgTLS) + return ArgTLSPtr = DFS.ArgTLS; + + IRBuilder<> IRB(F->getEntryBlock().begin()); + return ArgTLSPtr = IRB.CreateCall(DFS.GetArgTLS); +} + +Value *DFSanFunction::getRetvalTLS() { + if (RetvalTLSPtr) + return RetvalTLSPtr; + if (DFS.RetvalTLS) + return RetvalTLSPtr = DFS.RetvalTLS; + + IRBuilder<> IRB(F->getEntryBlock().begin()); + return RetvalTLSPtr = IRB.CreateCall(DFS.GetRetvalTLS); +} + +Value *DFSanFunction::getArgTLS(unsigned Idx, Instruction *Pos) { + IRBuilder<> IRB(Pos); + return IRB.CreateConstGEP2_64(getArgTLSPtr(), 0, Idx); +} + +Value *DFSanFunction::getShadow(Value *V) { + if (!isa<Argument>(V) && !isa<Instruction>(V)) + return DFS.ZeroShadow; + Value *&Shadow = ValShadowMap[V]; + if (!Shadow) { + if (Argument *A = dyn_cast<Argument>(V)) { + if (IsNativeABI) + return DFS.ZeroShadow; + switch (IA) { + case DataFlowSanitizer::IA_TLS: { + Value *ArgTLSPtr = getArgTLSPtr(); + Instruction *ArgTLSPos = + DFS.ArgTLS ? 
&*F->getEntryBlock().begin() + : cast<Instruction>(ArgTLSPtr)->getNextNode(); + IRBuilder<> IRB(ArgTLSPos); + Shadow = IRB.CreateLoad(getArgTLS(A->getArgNo(), ArgTLSPos)); + break; + } + case DataFlowSanitizer::IA_Args: { + unsigned ArgIdx = A->getArgNo() + F->getArgumentList().size() / 2; + Function::arg_iterator i = F->arg_begin(); + while (ArgIdx--) + ++i; + Shadow = i; + assert(Shadow->getType() == DFS.ShadowTy); + break; + } + } + NonZeroChecks.insert(Shadow); + } else { + Shadow = DFS.ZeroShadow; + } + } + return Shadow; +} + +void DFSanFunction::setShadow(Instruction *I, Value *Shadow) { + assert(!ValShadowMap.count(I)); + assert(Shadow->getType() == DFS.ShadowTy); + ValShadowMap[I] = Shadow; +} + +Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos) { + assert(Addr != RetvalTLS && "Reinstrumenting?"); + IRBuilder<> IRB(Pos); + return IRB.CreateIntToPtr( + IRB.CreateMul( + IRB.CreateAnd(IRB.CreatePtrToInt(Addr, IntptrTy), ShadowPtrMask), + ShadowPtrMul), + ShadowPtrTy); +} + +// Generates IR to compute the union of the two given shadows, inserting it +// before Pos. Returns the computed union Value. +Value *DataFlowSanitizer::combineShadows(Value *V1, Value *V2, + Instruction *Pos) { + if (V1 == ZeroShadow) + return V2; + if (V2 == ZeroShadow) + return V1; + if (V1 == V2) + return V1; + IRBuilder<> IRB(Pos); + BasicBlock *Head = Pos->getParent(); + Value *Ne = IRB.CreateICmpNE(V1, V2); + Instruction *NeInst = dyn_cast<Instruction>(Ne); + if (NeInst) { + BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen( + NeInst, /*Unreachable=*/ false, ColdCallWeights)); + IRBuilder<> ThenIRB(BI); + CallInst *Call = ThenIRB.CreateCall2(DFSanUnionFn, V1, V2); + Call->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt); + Call->addAttribute(1, Attribute::ZExt); + Call->addAttribute(2, Attribute::ZExt); + + BasicBlock *Tail = BI->getSuccessor(0); + PHINode *Phi = PHINode::Create(ShadowTy, 2, "", Tail->begin()); + Phi->addIncoming(Call, Call->getParent()); + Phi->addIncoming(V1, Head); + Pos = Phi; + return Phi; + } else { + assert(0 && "todo"); + return 0; + } +} + +// A convenience function which folds the shadows of each of the operands +// of the provided instruction Inst, inserting the IR before Inst. Returns +// the computed union Value. +Value *DFSanFunction::combineOperandShadows(Instruction *Inst) { + if (Inst->getNumOperands() == 0) + return DFS.ZeroShadow; + + Value *Shadow = getShadow(Inst->getOperand(0)); + for (unsigned i = 1, n = Inst->getNumOperands(); i != n; ++i) { + Shadow = DFS.combineShadows(Shadow, getShadow(Inst->getOperand(i)), Inst); + } + return Shadow; +} + +void DFSanVisitor::visitOperandShadowInst(Instruction &I) { + Value *CombinedShadow = DFSF.combineOperandShadows(&I); + DFSF.setShadow(&I, CombinedShadow); +} + +// Generates IR to load shadow corresponding to bytes [Addr, Addr+Size), where +// Addr has alignment Align, and take the union of each of those shadows. 
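+// For example (illustrative), with ShadowWidth == 16 a 4-byte load reads
+// 8 bytes of shadow memory and unions the four 16-bit labels found there.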
+Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align, + Instruction *Pos) { + if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) { + llvm::DenseMap<AllocaInst *, AllocaInst *>::iterator i = + AllocaShadowMap.find(AI); + if (i != AllocaShadowMap.end()) { + IRBuilder<> IRB(Pos); + return IRB.CreateLoad(i->second); + } + } + + uint64_t ShadowAlign = Align * DFS.ShadowWidth / 8; + SmallVector<Value *, 2> Objs; + GetUnderlyingObjects(Addr, Objs, DFS.DL); + bool AllConstants = true; + for (SmallVector<Value *, 2>::iterator i = Objs.begin(), e = Objs.end(); + i != e; ++i) { + if (isa<Function>(*i) || isa<BlockAddress>(*i)) + continue; + if (isa<GlobalVariable>(*i) && cast<GlobalVariable>(*i)->isConstant()) + continue; + + AllConstants = false; + break; + } + if (AllConstants) + return DFS.ZeroShadow; + + Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos); + switch (Size) { + case 0: + return DFS.ZeroShadow; + case 1: { + LoadInst *LI = new LoadInst(ShadowAddr, "", Pos); + LI->setAlignment(ShadowAlign); + return LI; + } + case 2: { + IRBuilder<> IRB(Pos); + Value *ShadowAddr1 = + IRB.CreateGEP(ShadowAddr, ConstantInt::get(DFS.IntptrTy, 1)); + return DFS.combineShadows(IRB.CreateAlignedLoad(ShadowAddr, ShadowAlign), + IRB.CreateAlignedLoad(ShadowAddr1, ShadowAlign), + Pos); + } + } + if (Size % (64 / DFS.ShadowWidth) == 0) { + // Fast path for the common case where each byte has identical shadow: load + // shadow 64 bits at a time, fall out to a __dfsan_union_load call if any + // shadow is non-equal. + BasicBlock *FallbackBB = BasicBlock::Create(*DFS.Ctx, "", F); + IRBuilder<> FallbackIRB(FallbackBB); + CallInst *FallbackCall = FallbackIRB.CreateCall2( + DFS.DFSanUnionLoadFn, ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)); + FallbackCall->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt); + + // Compare each of the shadows stored in the loaded 64 bits to each other, + // by computing (WideShadow rotl ShadowWidth) == WideShadow. + IRBuilder<> IRB(Pos); + Value *WideAddr = + IRB.CreateBitCast(ShadowAddr, Type::getInt64PtrTy(*DFS.Ctx)); + Value *WideShadow = IRB.CreateAlignedLoad(WideAddr, ShadowAlign); + Value *TruncShadow = IRB.CreateTrunc(WideShadow, DFS.ShadowTy); + Value *ShlShadow = IRB.CreateShl(WideShadow, DFS.ShadowWidth); + Value *ShrShadow = IRB.CreateLShr(WideShadow, 64 - DFS.ShadowWidth); + Value *RotShadow = IRB.CreateOr(ShlShadow, ShrShadow); + Value *ShadowsEq = IRB.CreateICmpEQ(WideShadow, RotShadow); + + BasicBlock *Head = Pos->getParent(); + BasicBlock *Tail = Head->splitBasicBlock(Pos); + // In the following code LastBr will refer to the previous basic block's + // conditional branch instruction, whose true successor is fixed up to point + // to the next block during the loop below or to the tail after the final + // iteration. 
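+    // Both successors of LastBr initially point at the fallback block; the
+    // true successor is only a placeholder until the loop below (or the code
+    // after it) retargets it via setSuccessor(0, ...).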
+ BranchInst *LastBr = BranchInst::Create(FallbackBB, FallbackBB, ShadowsEq); + ReplaceInstWithInst(Head->getTerminator(), LastBr); + + for (uint64_t Ofs = 64 / DFS.ShadowWidth; Ofs != Size; + Ofs += 64 / DFS.ShadowWidth) { + BasicBlock *NextBB = BasicBlock::Create(*DFS.Ctx, "", F); + IRBuilder<> NextIRB(NextBB); + WideAddr = NextIRB.CreateGEP(WideAddr, ConstantInt::get(DFS.IntptrTy, 1)); + Value *NextWideShadow = NextIRB.CreateAlignedLoad(WideAddr, ShadowAlign); + ShadowsEq = NextIRB.CreateICmpEQ(WideShadow, NextWideShadow); + LastBr->setSuccessor(0, NextBB); + LastBr = NextIRB.CreateCondBr(ShadowsEq, FallbackBB, FallbackBB); + } + + LastBr->setSuccessor(0, Tail); + FallbackIRB.CreateBr(Tail); + PHINode *Shadow = PHINode::Create(DFS.ShadowTy, 2, "", &Tail->front()); + Shadow->addIncoming(FallbackCall, FallbackBB); + Shadow->addIncoming(TruncShadow, LastBr->getParent()); + return Shadow; + } + + IRBuilder<> IRB(Pos); + CallInst *FallbackCall = IRB.CreateCall2( + DFS.DFSanUnionLoadFn, ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)); + FallbackCall->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt); + return FallbackCall; +} + +void DFSanVisitor::visitLoadInst(LoadInst &LI) { + uint64_t Size = DFSF.DFS.DL->getTypeStoreSize(LI.getType()); + uint64_t Align; + if (ClPreserveAlignment) { + Align = LI.getAlignment(); + if (Align == 0) + Align = DFSF.DFS.DL->getABITypeAlignment(LI.getType()); + } else { + Align = 1; + } + IRBuilder<> IRB(&LI); + Value *LoadedShadow = + DFSF.loadShadow(LI.getPointerOperand(), Size, Align, &LI); + Value *PtrShadow = DFSF.getShadow(LI.getPointerOperand()); + Value *CombinedShadow = DFSF.DFS.combineShadows(LoadedShadow, PtrShadow, &LI); + if (CombinedShadow != DFSF.DFS.ZeroShadow) + DFSF.NonZeroChecks.insert(CombinedShadow); + + DFSF.setShadow(&LI, CombinedShadow); +} + +void DFSanFunction::storeShadow(Value *Addr, uint64_t Size, uint64_t Align, + Value *Shadow, Instruction *Pos) { + if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) { + llvm::DenseMap<AllocaInst *, AllocaInst *>::iterator i = + AllocaShadowMap.find(AI); + if (i != AllocaShadowMap.end()) { + IRBuilder<> IRB(Pos); + IRB.CreateStore(Shadow, i->second); + return; + } + } + + uint64_t ShadowAlign = Align * DFS.ShadowWidth / 8; + IRBuilder<> IRB(Pos); + Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos); + if (Shadow == DFS.ZeroShadow) { + IntegerType *ShadowTy = IntegerType::get(*DFS.Ctx, Size * DFS.ShadowWidth); + Value *ExtZeroShadow = ConstantInt::get(ShadowTy, 0); + Value *ExtShadowAddr = + IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowTy)); + IRB.CreateAlignedStore(ExtZeroShadow, ExtShadowAddr, ShadowAlign); + return; + } + + const unsigned ShadowVecSize = 128 / DFS.ShadowWidth; + uint64_t Offset = 0; + if (Size >= ShadowVecSize) { + VectorType *ShadowVecTy = VectorType::get(DFS.ShadowTy, ShadowVecSize); + Value *ShadowVec = UndefValue::get(ShadowVecTy); + for (unsigned i = 0; i != ShadowVecSize; ++i) { + ShadowVec = IRB.CreateInsertElement( + ShadowVec, Shadow, ConstantInt::get(Type::getInt32Ty(*DFS.Ctx), i)); + } + Value *ShadowVecAddr = + IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowVecTy)); + do { + Value *CurShadowVecAddr = IRB.CreateConstGEP1_32(ShadowVecAddr, Offset); + IRB.CreateAlignedStore(ShadowVec, CurShadowVecAddr, ShadowAlign); + Size -= ShadowVecSize; + ++Offset; + } while (Size >= ShadowVecSize); + Offset *= ShadowVecSize; + } + while (Size > 0) { + Value *CurShadowAddr = IRB.CreateConstGEP1_32(ShadowAddr, Offset); + IRB.CreateAlignedStore(Shadow, 
CurShadowAddr, ShadowAlign); + --Size; + ++Offset; + } +} + +void DFSanVisitor::visitStoreInst(StoreInst &SI) { + uint64_t Size = + DFSF.DFS.DL->getTypeStoreSize(SI.getValueOperand()->getType()); + uint64_t Align; + if (ClPreserveAlignment) { + Align = SI.getAlignment(); + if (Align == 0) + Align = DFSF.DFS.DL->getABITypeAlignment(SI.getValueOperand()->getType()); + } else { + Align = 1; + } + DFSF.storeShadow(SI.getPointerOperand(), Size, Align, + DFSF.getShadow(SI.getValueOperand()), &SI); +} + +void DFSanVisitor::visitBinaryOperator(BinaryOperator &BO) { + visitOperandShadowInst(BO); +} + +void DFSanVisitor::visitCastInst(CastInst &CI) { visitOperandShadowInst(CI); } + +void DFSanVisitor::visitCmpInst(CmpInst &CI) { visitOperandShadowInst(CI); } + +void DFSanVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) { + visitOperandShadowInst(GEPI); +} + +void DFSanVisitor::visitExtractElementInst(ExtractElementInst &I) { + visitOperandShadowInst(I); +} + +void DFSanVisitor::visitInsertElementInst(InsertElementInst &I) { + visitOperandShadowInst(I); +} + +void DFSanVisitor::visitShuffleVectorInst(ShuffleVectorInst &I) { + visitOperandShadowInst(I); +} + +void DFSanVisitor::visitExtractValueInst(ExtractValueInst &I) { + visitOperandShadowInst(I); +} + +void DFSanVisitor::visitInsertValueInst(InsertValueInst &I) { + visitOperandShadowInst(I); +} + +void DFSanVisitor::visitAllocaInst(AllocaInst &I) { + bool AllLoadsStores = true; + for (Instruction::use_iterator i = I.use_begin(), e = I.use_end(); i != e; + ++i) { + if (isa<LoadInst>(*i)) + continue; + + if (StoreInst *SI = dyn_cast<StoreInst>(*i)) { + if (SI->getPointerOperand() == &I) + continue; + } + + AllLoadsStores = false; + break; + } + if (AllLoadsStores) { + IRBuilder<> IRB(&I); + DFSF.AllocaShadowMap[&I] = IRB.CreateAlloca(DFSF.DFS.ShadowTy); + } + DFSF.setShadow(&I, DFSF.DFS.ZeroShadow); +} + +void DFSanVisitor::visitSelectInst(SelectInst &I) { + Value *CondShadow = DFSF.getShadow(I.getCondition()); + Value *TrueShadow = DFSF.getShadow(I.getTrueValue()); + Value *FalseShadow = DFSF.getShadow(I.getFalseValue()); + + if (isa<VectorType>(I.getCondition()->getType())) { + DFSF.setShadow( + &I, DFSF.DFS.combineShadows( + CondShadow, + DFSF.DFS.combineShadows(TrueShadow, FalseShadow, &I), &I)); + } else { + Value *ShadowSel; + if (TrueShadow == FalseShadow) { + ShadowSel = TrueShadow; + } else { + ShadowSel = + SelectInst::Create(I.getCondition(), TrueShadow, FalseShadow, "", &I); + } + DFSF.setShadow(&I, DFSF.DFS.combineShadows(CondShadow, ShadowSel, &I)); + } +} + +void DFSanVisitor::visitMemSetInst(MemSetInst &I) { + IRBuilder<> IRB(&I); + Value *ValShadow = DFSF.getShadow(I.getValue()); + IRB.CreateCall3( + DFSF.DFS.DFSanSetLabelFn, ValShadow, + IRB.CreateBitCast(I.getDest(), Type::getInt8PtrTy(*DFSF.DFS.Ctx)), + IRB.CreateZExtOrTrunc(I.getLength(), DFSF.DFS.IntptrTy)); +} + +void DFSanVisitor::visitMemTransferInst(MemTransferInst &I) { + IRBuilder<> IRB(&I); + Value *DestShadow = DFSF.DFS.getShadowAddress(I.getDest(), &I); + Value *SrcShadow = DFSF.DFS.getShadowAddress(I.getSource(), &I); + Value *LenShadow = IRB.CreateMul( + I.getLength(), + ConstantInt::get(I.getLength()->getType(), DFSF.DFS.ShadowWidth / 8)); + Value *AlignShadow; + if (ClPreserveAlignment) { + AlignShadow = IRB.CreateMul(I.getAlignmentCst(), + ConstantInt::get(I.getAlignmentCst()->getType(), + DFSF.DFS.ShadowWidth / 8)); + } else { + AlignShadow = ConstantInt::get(I.getAlignmentCst()->getType(), + DFSF.DFS.ShadowWidth / 8); + } + Type *Int8Ptr = 
Type::getInt8PtrTy(*DFSF.DFS.Ctx); + DestShadow = IRB.CreateBitCast(DestShadow, Int8Ptr); + SrcShadow = IRB.CreateBitCast(SrcShadow, Int8Ptr); + IRB.CreateCall5(I.getCalledValue(), DestShadow, SrcShadow, LenShadow, + AlignShadow, I.getVolatileCst()); +} + +void DFSanVisitor::visitReturnInst(ReturnInst &RI) { + if (!DFSF.IsNativeABI && RI.getReturnValue()) { + switch (DFSF.IA) { + case DataFlowSanitizer::IA_TLS: { + Value *S = DFSF.getShadow(RI.getReturnValue()); + IRBuilder<> IRB(&RI); + IRB.CreateStore(S, DFSF.getRetvalTLS()); + break; + } + case DataFlowSanitizer::IA_Args: { + IRBuilder<> IRB(&RI); + Type *RT = DFSF.F->getFunctionType()->getReturnType(); + Value *InsVal = + IRB.CreateInsertValue(UndefValue::get(RT), RI.getReturnValue(), 0); + Value *InsShadow = + IRB.CreateInsertValue(InsVal, DFSF.getShadow(RI.getReturnValue()), 1); + RI.setOperand(0, InsShadow); + break; + } + } + } +} + +void DFSanVisitor::visitCallSite(CallSite CS) { + Function *F = CS.getCalledFunction(); + if ((F && F->isIntrinsic()) || isa<InlineAsm>(CS.getCalledValue())) { + visitOperandShadowInst(*CS.getInstruction()); + return; + } + + IRBuilder<> IRB(CS.getInstruction()); + + DenseMap<Value *, Function *>::iterator i = + DFSF.DFS.UnwrappedFnMap.find(CS.getCalledValue()); + if (i != DFSF.DFS.UnwrappedFnMap.end()) { + Function *F = i->second; + switch (DFSF.DFS.getWrapperKind(F)) { + case DataFlowSanitizer::WK_Warning: { + CS.setCalledFunction(F); + IRB.CreateCall(DFSF.DFS.DFSanUnimplementedFn, + IRB.CreateGlobalStringPtr(F->getName())); + DFSF.setShadow(CS.getInstruction(), DFSF.DFS.ZeroShadow); + return; + } + case DataFlowSanitizer::WK_Discard: { + CS.setCalledFunction(F); + DFSF.setShadow(CS.getInstruction(), DFSF.DFS.ZeroShadow); + return; + } + case DataFlowSanitizer::WK_Functional: { + CS.setCalledFunction(F); + visitOperandShadowInst(*CS.getInstruction()); + return; + } + case DataFlowSanitizer::WK_Custom: { + // Don't try to handle invokes of custom functions, it's too complicated. + // Instead, invoke the dfsw$ wrapper, which will in turn call the __dfsw_ + // wrapper. + if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) { + FunctionType *FT = F->getFunctionType(); + FunctionType *CustomFT = DFSF.DFS.getCustomFunctionType(FT); + std::string CustomFName = "__dfsw_"; + CustomFName += F->getName(); + Constant *CustomF = + DFSF.DFS.Mod->getOrInsertFunction(CustomFName, CustomFT); + if (Function *CustomFn = dyn_cast<Function>(CustomF)) { + CustomFn->copyAttributesFrom(F); + + // Custom functions returning non-void will write to the return label. 
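+          // (Keeping a readonly/readnone attribute copied from F would be
+          // unsound here, since the wrapper stores through the extra
+          // return-label pointer.)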
+ if (!FT->getReturnType()->isVoidTy()) { + CustomFn->removeAttributes(AttributeSet::FunctionIndex, + DFSF.DFS.ReadOnlyNoneAttrs); + } + } + + std::vector<Value *> Args; + + CallSite::arg_iterator i = CS.arg_begin(); + for (unsigned n = FT->getNumParams(); n != 0; ++i, --n) { + Type *T = (*i)->getType(); + FunctionType *ParamFT; + if (isa<PointerType>(T) && + (ParamFT = dyn_cast<FunctionType>( + cast<PointerType>(T)->getElementType()))) { + std::string TName = "dfst"; + TName += utostr(FT->getNumParams() - n); + TName += "$"; + TName += F->getName(); + Constant *T = DFSF.DFS.getOrBuildTrampolineFunction(ParamFT, TName); + Args.push_back(T); + Args.push_back( + IRB.CreateBitCast(*i, Type::getInt8PtrTy(*DFSF.DFS.Ctx))); + } else { + Args.push_back(*i); + } + } + + i = CS.arg_begin(); + for (unsigned n = FT->getNumParams(); n != 0; ++i, --n) + Args.push_back(DFSF.getShadow(*i)); + + if (!FT->getReturnType()->isVoidTy()) { + if (!DFSF.LabelReturnAlloca) { + DFSF.LabelReturnAlloca = + new AllocaInst(DFSF.DFS.ShadowTy, "labelreturn", + DFSF.F->getEntryBlock().begin()); + } + Args.push_back(DFSF.LabelReturnAlloca); + } + + CallInst *CustomCI = IRB.CreateCall(CustomF, Args); + CustomCI->setCallingConv(CI->getCallingConv()); + CustomCI->setAttributes(CI->getAttributes()); + + if (!FT->getReturnType()->isVoidTy()) { + LoadInst *LabelLoad = IRB.CreateLoad(DFSF.LabelReturnAlloca); + DFSF.setShadow(CustomCI, LabelLoad); + } + + CI->replaceAllUsesWith(CustomCI); + CI->eraseFromParent(); + return; + } + break; + } + } + } + + FunctionType *FT = cast<FunctionType>( + CS.getCalledValue()->getType()->getPointerElementType()); + if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) { + for (unsigned i = 0, n = FT->getNumParams(); i != n; ++i) { + IRB.CreateStore(DFSF.getShadow(CS.getArgument(i)), + DFSF.getArgTLS(i, CS.getInstruction())); + } + } + + Instruction *Next = 0; + if (!CS.getType()->isVoidTy()) { + if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) { + if (II->getNormalDest()->getSinglePredecessor()) { + Next = II->getNormalDest()->begin(); + } else { + BasicBlock *NewBB = + SplitEdge(II->getParent(), II->getNormalDest(), &DFSF.DFS); + Next = NewBB->begin(); + } + } else { + Next = CS->getNextNode(); + } + + if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) { + IRBuilder<> NextIRB(Next); + LoadInst *LI = NextIRB.CreateLoad(DFSF.getRetvalTLS()); + DFSF.SkipInsts.insert(LI); + DFSF.setShadow(CS.getInstruction(), LI); + DFSF.NonZeroChecks.insert(LI); + } + } + + // Do all instrumentation for IA_Args down here to defer tampering with the + // CFG in a way that SplitEdge may be able to detect. 
+ if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_Args) { + FunctionType *NewFT = DFSF.DFS.getArgsFunctionType(FT); + Value *Func = + IRB.CreateBitCast(CS.getCalledValue(), PointerType::getUnqual(NewFT)); + std::vector<Value *> Args; + + CallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); + for (unsigned n = FT->getNumParams(); n != 0; ++i, --n) + Args.push_back(*i); + + i = CS.arg_begin(); + for (unsigned n = FT->getNumParams(); n != 0; ++i, --n) + Args.push_back(DFSF.getShadow(*i)); + + if (FT->isVarArg()) { + unsigned VarArgSize = CS.arg_size() - FT->getNumParams(); + ArrayType *VarArgArrayTy = ArrayType::get(DFSF.DFS.ShadowTy, VarArgSize); + AllocaInst *VarArgShadow = + new AllocaInst(VarArgArrayTy, "", DFSF.F->getEntryBlock().begin()); + Args.push_back(IRB.CreateConstGEP2_32(VarArgShadow, 0, 0)); + for (unsigned n = 0; i != e; ++i, ++n) { + IRB.CreateStore(DFSF.getShadow(*i), + IRB.CreateConstGEP2_32(VarArgShadow, 0, n)); + Args.push_back(*i); + } + } + + CallSite NewCS; + if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) { + NewCS = IRB.CreateInvoke(Func, II->getNormalDest(), II->getUnwindDest(), + Args); + } else { + NewCS = IRB.CreateCall(Func, Args); + } + NewCS.setCallingConv(CS.getCallingConv()); + NewCS.setAttributes(CS.getAttributes().removeAttributes( + *DFSF.DFS.Ctx, AttributeSet::ReturnIndex, + AttributeFuncs::typeIncompatible(NewCS.getInstruction()->getType(), + AttributeSet::ReturnIndex))); + + if (Next) { + ExtractValueInst *ExVal = + ExtractValueInst::Create(NewCS.getInstruction(), 0, "", Next); + DFSF.SkipInsts.insert(ExVal); + ExtractValueInst *ExShadow = + ExtractValueInst::Create(NewCS.getInstruction(), 1, "", Next); + DFSF.SkipInsts.insert(ExShadow); + DFSF.setShadow(ExVal, ExShadow); + DFSF.NonZeroChecks.insert(ExShadow); + + CS.getInstruction()->replaceAllUsesWith(ExVal); + } + + CS.getInstruction()->eraseFromParent(); + } +} + +void DFSanVisitor::visitPHINode(PHINode &PN) { + PHINode *ShadowPN = + PHINode::Create(DFSF.DFS.ShadowTy, PN.getNumIncomingValues(), "", &PN); + + // Give the shadow phi node valid predecessors to fool SplitEdge into working. + Value *UndefShadow = UndefValue::get(DFSF.DFS.ShadowTy); + for (PHINode::block_iterator i = PN.block_begin(), e = PN.block_end(); i != e; + ++i) { + ShadowPN->addIncoming(UndefShadow, *i); + } + + DFSF.PHIFixups.push_back(std::make_pair(&PN, ShadowPN)); + DFSF.setShadow(&PN, ShadowPN); +} diff --git a/contrib/llvm/lib/Transforms/Instrumentation/DebugIR.cpp b/contrib/llvm/lib/Transforms/Instrumentation/DebugIR.cpp new file mode 100644 index 0000000..f50a044 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Instrumentation/DebugIR.cpp @@ -0,0 +1,618 @@ +//===--- DebugIR.cpp - Transform debug metadata to allow debugging IR -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// A Module transform pass that emits a succinct version of the IR and replaces +// the source file metadata to allow debuggers to step through the IR. +// +// FIXME: instead of replacing debug metadata, this pass should allow for +// additional metadata to be used to point capable debuggers to the IR file +// without destroying the mapping to the original source file. 
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "debug-ir"
+
+#include "llvm/ADT/ValueMap.h"
+#include "llvm/Assembly/AssemblyAnnotationWriter.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/DIBuilder.h"
+#include "llvm/InstVisitor.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
+
+#include "DebugIR.h"
+
+#include <string>
+
+#define STR_HELPER(x) #x
+#define STR(x) STR_HELPER(x)
+
+using namespace llvm;
+
+namespace {
+
+/// Builds a map of Value* to line numbers on which the Value appears in a
+/// textual representation of the IR by plugging into the AssemblyWriter by
+/// masquerading as an AssemblyAnnotationWriter.
+class ValueToLineMap : public AssemblyAnnotationWriter {
+  ValueMap<const Value *, unsigned int> Lines;
+  typedef ValueMap<const Value *, unsigned int>::const_iterator LineIter;
+
+  void addEntry(const Value *V, formatted_raw_ostream &Out) {
+    Out.flush();
+    Lines.insert(std::make_pair(V, Out.getLine() + 1));
+  }
+
+public:
+
+  /// Prints Module to a null buffer in order to build the map of Value pointers
+  /// to line numbers.
+  ValueToLineMap(const Module *M) {
+    raw_null_ostream ThrowAway;
+    M->print(ThrowAway, this);
+  }
+
+  // This function is called after an Instruction, GlobalValue, or GlobalAlias
+  // is printed.
+  void printInfoComment(const Value &V, formatted_raw_ostream &Out) {
+    addEntry(&V, Out);
+  }
+
+  void emitFunctionAnnot(const Function *F, formatted_raw_ostream &Out) {
+    addEntry(F, Out);
+  }
+
+  /// If V appears on a line in the textual IR representation, sets Line to the
+  /// line number and returns true, otherwise returns false.
+  bool getLine(const Value *V, unsigned int &Line) const {
+    LineIter i = Lines.find(V);
+    if (i != Lines.end()) {
+      Line = i->second;
+      return true;
+    }
+    return false;
+  }
+};
+
+/// Removes debug intrinsics like llvm.dbg.declare and llvm.dbg.value.
+class DebugIntrinsicsRemover : public InstVisitor<DebugIntrinsicsRemover> {
+  void remove(Instruction &I) { I.eraseFromParent(); }
+
+public:
+  static void process(Module &M) {
+    DebugIntrinsicsRemover Remover;
+    Remover.visit(&M);
+  }
+  void visitDbgDeclareInst(DbgDeclareInst &I) { remove(I); }
+  void visitDbgValueInst(DbgValueInst &I) { remove(I); }
+  void visitDbgInfoIntrinsic(DbgInfoIntrinsic &I) { remove(I); }
+};
+
+/// Removes debug metadata (!dbg) nodes from all instructions, and optionally
+/// metadata named "llvm.dbg.cu" if RemoveNamedInfo is true.
+class DebugMetadataRemover : public InstVisitor<DebugMetadataRemover> {
+  bool RemoveNamedInfo;
+
+public:
+  static void process(Module &M, bool RemoveNamedInfo = true) {
+    DebugMetadataRemover Remover(RemoveNamedInfo);
+    Remover.run(&M);
+  }
+
+  DebugMetadataRemover(bool RemoveNamedInfo)
+      : RemoveNamedInfo(RemoveNamedInfo) {}
+
+  void visitInstruction(Instruction &I) {
+    if (I.getMetadata(LLVMContext::MD_dbg))
+      I.setMetadata(LLVMContext::MD_dbg, 0);
+  }
+
+  void run(Module *M) {
+    // Remove debug metadata attached to instructions
+    visit(M);
+
+    if (RemoveNamedInfo) {
+      // Remove CU named metadata (and all children nodes)
+      NamedMDNode *Node = M->getNamedMetadata("llvm.dbg.cu");
+      if (Node)
+        M->eraseNamedMetadata(Node);
+    }
+  }
+};
+
+/// Updates debug metadata in a Module:
+/// - changes Filename/Directory to values provided on construction
+/// - adds/updates line number (DebugLoc) entries associated with each
+///   instruction to reflect the instruction's location in an LLVM IR file
+class DIUpdater : public InstVisitor<DIUpdater> {
+  /// Builder of debug information
+  DIBuilder Builder;
+
+  /// Helper for type attributes/sizes/etc
+  DataLayout Layout;
+
+  /// Map of Value* to line numbers
+  const ValueToLineMap LineTable;
+
+  /// Map of Value* (in original Module) to Value* (in optional cloned Module)
+  const ValueToValueMapTy *VMap;
+
+  /// Directory of debug metadata
+  DebugInfoFinder Finder;
+
+  /// Source filename and directory
+  StringRef Filename;
+  StringRef Directory;
+
+  // CU nodes needed when creating DI subprograms
+  MDNode *FileNode;
+  MDNode *LexicalBlockFileNode;
+  const MDNode *CUNode;
+
+  ValueMap<const Function *, MDNode *> SubprogramDescriptors;
+  DenseMap<const Type *, MDNode *> TypeDescriptors;
+
+public:
+  DIUpdater(Module &M, StringRef Filename = StringRef(),
+            StringRef Directory = StringRef(), const Module *DisplayM = 0,
+            const ValueToValueMapTy *VMap = 0)
+      : Builder(M), Layout(&M), LineTable(DisplayM ? DisplayM : &M), VMap(VMap),
+        Finder(), Filename(Filename), Directory(Directory), FileNode(0),
+        LexicalBlockFileNode(0), CUNode(0) {
+    Finder.processModule(M);
+    visit(&M);
+  }
+
+  ~DIUpdater() { Builder.finalize(); }
+
+  void visitModule(Module &M) {
+    if (Finder.compile_unit_count() > 1)
+      report_fatal_error("DebugIR pass supports only a single compile unit per "
+                         "Module.");
+    createCompileUnit(
+        Finder.compile_unit_count() == 1 ?
*Finder.compile_unit_begin() : 0); + } + + void visitFunction(Function &F) { + if (F.isDeclaration() || findDISubprogram(&F)) + return; + + StringRef MangledName = F.getName(); + DICompositeType Sig = createFunctionSignature(&F); + + // find line of function declaration + unsigned Line = 0; + if (!findLine(&F, Line)) { + DEBUG(dbgs() << "WARNING: No line for Function " << F.getName().str() + << "\n"); + return; + } + + Instruction *FirstInst = F.begin()->begin(); + unsigned ScopeLine = 0; + if (!findLine(FirstInst, ScopeLine)) { + DEBUG(dbgs() << "WARNING: No line for 1st Instruction in Function " + << F.getName().str() << "\n"); + return; + } + + bool Local = F.hasInternalLinkage(); + bool IsDefinition = !F.isDeclaration(); + bool IsOptimized = false; + + int FuncFlags = llvm::DIDescriptor::FlagPrototyped; + assert(CUNode && FileNode); + DISubprogram Sub = Builder.createFunction( + DICompileUnit(CUNode), F.getName(), MangledName, DIFile(FileNode), Line, + Sig, Local, IsDefinition, ScopeLine, FuncFlags, IsOptimized, &F); + assert(Sub.isSubprogram()); + DEBUG(dbgs() << "create subprogram mdnode " << *Sub << ": " + << "\n"); + + SubprogramDescriptors.insert(std::make_pair(&F, Sub)); + } + + void visitInstruction(Instruction &I) { + DebugLoc Loc(I.getDebugLoc()); + + /// If a ValueToValueMap is provided, use it to get the real instruction as + /// the line table was generated on a clone of the module on which we are + /// operating. + Value *RealInst = 0; + if (VMap) + RealInst = VMap->lookup(&I); + + if (!RealInst) + RealInst = &I; + + unsigned Col = 0; // FIXME: support columns + unsigned Line; + if (!LineTable.getLine(RealInst, Line)) { + // Instruction has no line, it may have been removed (in the module that + // will be passed to the debugger) so there is nothing to do here. + DEBUG(dbgs() << "WARNING: no LineTable entry for instruction " << RealInst + << "\n"); + DEBUG(RealInst->dump()); + return; + } + + DebugLoc NewLoc; + if (!Loc.isUnknown()) + // I had a previous debug location: re-use the DebugLoc + NewLoc = DebugLoc::get(Line, Col, Loc.getScope(RealInst->getContext()), + Loc.getInlinedAt(RealInst->getContext())); + else if (MDNode *scope = findScope(&I)) + NewLoc = DebugLoc::get(Line, Col, scope, 0); + else { + DEBUG(dbgs() << "WARNING: no valid scope for instruction " << &I + << ". no DebugLoc will be present." + << "\n"); + return; + } + + addDebugLocation(I, NewLoc); + } + +private: + + void createCompileUnit(MDNode *CUToReplace) { + std::string Flags; + bool IsOptimized = false; + StringRef Producer; + unsigned RuntimeVersion(0); + StringRef SplitName; + + if (CUToReplace) { + // save fields from existing CU to re-use in the new CU + DICompileUnit ExistingCU(CUToReplace); + Producer = ExistingCU.getProducer(); + IsOptimized = ExistingCU.isOptimized(); + Flags = ExistingCU.getFlags(); + RuntimeVersion = ExistingCU.getRunTimeVersion(); + SplitName = ExistingCU.getSplitDebugFilename(); + } else { + Producer = + "LLVM Version " STR(LLVM_VERSION_MAJOR) "." 
STR(LLVM_VERSION_MINOR); + } + + CUNode = + Builder.createCompileUnit(dwarf::DW_LANG_C99, Filename, Directory, + Producer, IsOptimized, Flags, RuntimeVersion); + + if (CUToReplace) + CUToReplace->replaceAllUsesWith(const_cast<MDNode *>(CUNode)); + + DICompileUnit CU(CUNode); + FileNode = Builder.createFile(Filename, Directory); + LexicalBlockFileNode = Builder.createLexicalBlockFile(CU, DIFile(FileNode)); + } + + /// Returns the MDNode* that represents the DI scope to associate with I + MDNode *findScope(const Instruction *I) { + const Function *F = I->getParent()->getParent(); + if (MDNode *ret = findDISubprogram(F)) + return ret; + + DEBUG(dbgs() << "WARNING: Using fallback lexical block file scope " + << LexicalBlockFileNode << " as scope for instruction " << I + << "\n"); + return LexicalBlockFileNode; + } + + /// Returns the MDNode* that is the descriptor for F + MDNode *findDISubprogram(const Function *F) { + typedef ValueMap<const Function *, MDNode *>::const_iterator FuncNodeIter; + FuncNodeIter i = SubprogramDescriptors.find(F); + if (i != SubprogramDescriptors.end()) + return i->second; + + DEBUG(dbgs() << "searching for DI scope node for Function " << F + << " in a list of " << Finder.subprogram_count() + << " subprogram nodes" + << "\n"); + + for (DebugInfoFinder::iterator i = Finder.subprogram_begin(), + e = Finder.subprogram_end(); + i != e; ++i) { + DISubprogram S(*i); + if (S.getFunction() == F) { + DEBUG(dbgs() << "Found DISubprogram " << *i << " for function " + << S.getFunction() << "\n"); + return *i; + } + } + DEBUG(dbgs() << "unable to find DISubprogram node for function " + << F->getName().str() << "\n"); + return 0; + } + + /// Sets Line to the line number on which V appears and returns true. If a + /// line location for V is not found, returns false. + bool findLine(const Value *V, unsigned &Line) { + if (LineTable.getLine(V, Line)) + return true; + + if (VMap) { + Value *mapped = VMap->lookup(V); + if (mapped && LineTable.getLine(mapped, Line)) + return true; + } + return false; + } + + std::string getTypeName(Type *T) { + std::string TypeName; + raw_string_ostream TypeStream(TypeName); + T->print(TypeStream); + TypeStream.flush(); + return TypeName; + } + + /// Returns the MDNode that represents type T if it is already created, or 0 + /// if it is not. + MDNode *getType(const Type *T) { + typedef DenseMap<const Type *, MDNode *>::const_iterator TypeNodeIter; + TypeNodeIter i = TypeDescriptors.find(T); + if (i != TypeDescriptors.end()) + return i->second; + return 0; + } + + /// Returns a DebugInfo type from an LLVM type T. + DIDerivedType getOrCreateType(Type *T) { + MDNode *N = getType(T); + if (N) + return DIDerivedType(N); + else if (T->isVoidTy()) + return DIDerivedType(0); + else if (T->isStructTy()) { + N = Builder.createStructType( + DIScope(LexicalBlockFileNode), T->getStructName(), DIFile(FileNode), + 0, Layout.getTypeSizeInBits(T), Layout.getABITypeAlignment(T), 0, + DIType(0), DIArray(0)); // filled in later + + // N is added to the map (early) so that element search below can find it, + // so as to avoid infinite recursion for structs that contain pointers to + // their own type. 
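+      // (Illustrative case: struct Node { struct Node *Next; }; without this
+      // early insertion, getOrCreateType would recurse on Node forever.)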
+ TypeDescriptors[T] = N; + DICompositeType StructDescriptor(N); + + SmallVector<Value *, 4> Elements; + for (unsigned i = 0; i < T->getStructNumElements(); ++i) + Elements.push_back(getOrCreateType(T->getStructElementType(i))); + + // set struct elements + StructDescriptor.setTypeArray(Builder.getOrCreateArray(Elements)); + } else if (T->isPointerTy()) { + Type *PointeeTy = T->getPointerElementType(); + if (!(N = getType(PointeeTy))) + N = Builder.createPointerType( + getOrCreateType(PointeeTy), Layout.getPointerTypeSizeInBits(T), + Layout.getPrefTypeAlignment(T), getTypeName(T)); + } else if (T->isArrayTy()) { + SmallVector<Value *, 1> Subrange; + Subrange.push_back( + Builder.getOrCreateSubrange(0, T->getArrayNumElements() - 1)); + + N = Builder.createArrayType(Layout.getTypeSizeInBits(T), + Layout.getPrefTypeAlignment(T), + getOrCreateType(T->getArrayElementType()), + Builder.getOrCreateArray(Subrange)); + } else { + int encoding = llvm::dwarf::DW_ATE_signed; + if (T->isIntegerTy()) + encoding = llvm::dwarf::DW_ATE_unsigned; + else if (T->isFloatingPointTy()) + encoding = llvm::dwarf::DW_ATE_float; + + N = Builder.createBasicType(getTypeName(T), T->getPrimitiveSizeInBits(), + 0, encoding); + } + TypeDescriptors[T] = N; + return DIDerivedType(N); + } + + /// Returns a DebugInfo type that represents a function signature for Func. + DICompositeType createFunctionSignature(const Function *Func) { + SmallVector<Value *, 4> Params; + DIDerivedType ReturnType(getOrCreateType(Func->getReturnType())); + Params.push_back(ReturnType); + + const Function::ArgumentListType &Args(Func->getArgumentList()); + for (Function::ArgumentListType::const_iterator i = Args.begin(), + e = Args.end(); + i != e; ++i) { + Type *T(i->getType()); + Params.push_back(getOrCreateType(T)); + } + + DIArray ParamArray = Builder.getOrCreateArray(Params); + return Builder.createSubroutineType(DIFile(FileNode), ParamArray); + } + + /// Associates Instruction I with debug location Loc. + void addDebugLocation(Instruction &I, DebugLoc Loc) { + MDNode *MD = Loc.getAsMDNode(I.getContext()); + I.setMetadata(LLVMContext::MD_dbg, MD); + } +}; + +/// Sets Filename/Directory from the Module identifier and returns true, or +/// false if source information is not present. +bool getSourceInfoFromModule(const Module &M, std::string &Directory, + std::string &Filename) { + std::string PathStr(M.getModuleIdentifier()); + if (PathStr.length() == 0 || PathStr == "<stdin>") + return false; + + Filename = sys::path::filename(PathStr); + SmallVector<char, 16> Path(PathStr.begin(), PathStr.end()); + sys::path::remove_filename(Path); + Directory = StringRef(Path.data(), Path.size()); + return true; +} + +// Sets Filename/Directory from debug information in M and returns true, or +// false if no debug information available, or cannot be parsed. 
+bool getSourceInfoFromDI(const Module &M, std::string &Directory, + std::string &Filename) { + NamedMDNode *CUNode = M.getNamedMetadata("llvm.dbg.cu"); + if (!CUNode || CUNode->getNumOperands() == 0) + return false; + + DICompileUnit CU(CUNode->getOperand(0)); + if (!CU.Verify()) + return false; + + Filename = CU.getFilename(); + Directory = CU.getDirectory(); + return true; +} + +} // anonymous namespace + +namespace llvm { + +bool DebugIR::getSourceInfo(const Module &M) { + ParsedPath = getSourceInfoFromDI(M, Directory, Filename) || + getSourceInfoFromModule(M, Directory, Filename); + return ParsedPath; +} + +bool DebugIR::updateExtension(StringRef NewExtension) { + size_t dot = Filename.find_last_of("."); + if (dot == std::string::npos) + return false; + + Filename.erase(dot); + Filename += NewExtension.str(); + return true; +} + +void DebugIR::generateFilename(OwningPtr<int> &fd) { + SmallVector<char, 16> PathVec; + fd.reset(new int); + sys::fs::createTemporaryFile("debug-ir", "ll", *fd, PathVec); + StringRef Path(PathVec.data(), PathVec.size()); + Filename = sys::path::filename(Path); + sys::path::remove_filename(PathVec); + Directory = StringRef(PathVec.data(), PathVec.size()); + + GeneratedPath = true; +} + +std::string DebugIR::getPath() { + SmallVector<char, 16> Path; + sys::path::append(Path, Directory, Filename); + Path.resize(Filename.size() + Directory.size() + 2); + Path[Filename.size() + Directory.size() + 1] = '\0'; + return std::string(Path.data()); +} + +void DebugIR::writeDebugBitcode(const Module *M, int *fd) { + OwningPtr<raw_fd_ostream> Out; + std::string error; + + if (!fd) { + std::string Path = getPath(); + Out.reset(new raw_fd_ostream(Path.c_str(), error)); + DEBUG(dbgs() << "WRITING debug bitcode from Module " << M << " to file " + << Path << "\n"); + } else { + DEBUG(dbgs() << "WRITING debug bitcode from Module " << M << " to fd " + << *fd << "\n"); + Out.reset(new raw_fd_ostream(*fd, true)); + } + + M->print(*Out, 0); + Out->close(); +} + +void DebugIR::createDebugInfo(Module &M, OwningPtr<Module> &DisplayM) { + if (M.getFunctionList().size() == 0) + // no functions -- no debug info needed + return; + + OwningPtr<ValueToValueMapTy> VMap; + + if (WriteSourceToDisk && (HideDebugIntrinsics || HideDebugMetadata)) { + VMap.reset(new ValueToValueMapTy); + DisplayM.reset(CloneModule(&M, *VMap)); + + if (HideDebugIntrinsics) + DebugIntrinsicsRemover::process(*DisplayM); + + if (HideDebugMetadata) + DebugMetadataRemover::process(*DisplayM); + } + + DIUpdater R(M, Filename, Directory, DisplayM.get(), VMap.get()); +} + +bool DebugIR::isMissingPath() { return Filename.empty() || Directory.empty(); } + +bool DebugIR::runOnModule(Module &M) { + OwningPtr<int> fd; + + if (isMissingPath() && !getSourceInfo(M)) { + if (!WriteSourceToDisk) + report_fatal_error("DebugIR unable to determine file name in input. " + "Ensure Module contains an identifier, a valid " + "DICompileUnit, or construct DebugIR with " + "non-empty Filename/Directory parameters."); + else + generateFilename(fd); + } + + if (!GeneratedPath && WriteSourceToDisk) + updateExtension(".debug-ll"); + + // Clear line numbers. Keep debug info (if any) if we were able to read the + // file name from the DICompileUnit descriptor. + DebugMetadataRemover::process(M, !ParsedPath); + + OwningPtr<Module> DisplayM; + createDebugInfo(M, DisplayM); + if (WriteSourceToDisk) { + Module *OutputM = DisplayM.get() ? 
                                           DisplayM.get() : &M;
+    writeDebugBitcode(OutputM, fd.get());
+  }
+
+  DEBUG(M.dump());
+  return true;
+}
+
+bool DebugIR::runOnModule(Module &M, std::string &Path) {
+  bool result = runOnModule(M);
+  Path = getPath();
+  return result;
+}
+
+} // llvm namespace
+
+char DebugIR::ID = 0;
+INITIALIZE_PASS(DebugIR, "debug-ir", "Enable debugging IR", false, false)
+
+ModulePass *llvm::createDebugIRPass(bool HideDebugIntrinsics,
+                                    bool HideDebugMetadata, StringRef Directory,
+                                    StringRef Filename) {
+  return new DebugIR(HideDebugIntrinsics, HideDebugMetadata, Directory,
+                     Filename);
+}
+
+ModulePass *llvm::createDebugIRPass() { return new DebugIR(); }
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/DebugIR.h b/contrib/llvm/lib/Transforms/Instrumentation/DebugIR.h
new file mode 100644
index 0000000..13774cf
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/DebugIR.h
@@ -0,0 +1,99 @@
+//===- llvm/Transforms/Instrumentation/DebugIR.h - Interface ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interface of the DebugIR pass. For most users,
+// including Instrumentation.h and calling createDebugIRPass() is sufficient and
+// there is no need to include this file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_DEBUGIR_H
+#define LLVM_TRANSFORMS_INSTRUMENTATION_DEBUGIR_H
+
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+class DebugIR : public llvm::ModulePass {
+  /// If true, write a source file to disk.
+  bool WriteSourceToDisk;
+
+  /// Hide certain (non-essential) debug information (only relevant if
+  /// WriteSourceToDisk is true).
+  bool HideDebugIntrinsics;
+  bool HideDebugMetadata;
+
+  /// The location of the source file.
+  std::string Directory;
+  std::string Filename;
+
+  /// True if a temporary file name was generated.
+  bool GeneratedPath;
+
+  /// True if the file name was read from the Module.
+  bool ParsedPath;
+
+public:
+  static char ID;
+
+  const char *getPassName() const { return "DebugIR"; }
+
+  /// Generate a file on disk to be displayed in a debugger. If Filename and
+  /// Directory are empty, a temporary path will be generated.
+  DebugIR(bool HideDebugIntrinsics, bool HideDebugMetadata,
+          llvm::StringRef Directory, llvm::StringRef Filename)
+      : ModulePass(ID), WriteSourceToDisk(true),
+        HideDebugIntrinsics(HideDebugIntrinsics),
+        HideDebugMetadata(HideDebugMetadata), Directory(Directory),
+        Filename(Filename), GeneratedPath(false), ParsedPath(false) {}
+
+  /// Modify input in-place; do not generate additional files, and do not hide
+  /// any debug intrinsics/metadata that might be present.
+  DebugIR()
+      : ModulePass(ID), WriteSourceToDisk(false), HideDebugIntrinsics(false),
+        HideDebugMetadata(false), GeneratedPath(false), ParsedPath(false) {}
+
+  /// Run pass on M and set Path to the source file path in the output module.
+  bool runOnModule(llvm::Module &M, std::string &Path);
+  bool runOnModule(llvm::Module &M);
+
+private:
+
+  /// Returns the concatenated Directory + Filename, without error checking.
+  std::string getPath();
+
+  /// Attempts to read source information from debug information in M, and if
+  /// that fails, from M's identifier. Returns true on success, false otherwise.
+ bool getSourceInfo(const llvm::Module &M); + + /// Replace the extension of Filename with NewExtension, and return true if + /// successful. Return false if extension could not be found or Filename is + /// empty. + bool updateExtension(llvm::StringRef NewExtension); + + /// Generate a temporary filename and open an fd + void generateFilename(llvm::OwningPtr<int> &fd); + + /// Creates DWARF CU/Subroutine metadata + void createDebugInfo(llvm::Module &M, + llvm::OwningPtr<llvm::Module> &DisplayM); + + /// Returns true if either Directory or Filename is missing, false otherwise. + bool isMissingPath(); + + /// Write M to disk, optionally passing in an fd to an open file which is + /// closed by this function after writing. If no fd is specified, a new file + /// is opened, written, and closed. + void writeDebugBitcode(const llvm::Module *M, int *fd = 0); +}; + +} // llvm namespace + +#endif // LLVM_TRANSFORMS_INSTRUMENTATION_DEBUGIR_H diff --git a/contrib/llvm/lib/Transforms/Instrumentation/EdgeProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/EdgeProfiling.cpp deleted file mode 100644 index a2459fb..0000000 --- a/contrib/llvm/lib/Transforms/Instrumentation/EdgeProfiling.cpp +++ /dev/null @@ -1,117 +0,0 @@ -//===- EdgeProfiling.cpp - Insert counters for edge profiling -------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass instruments the specified program with counters for edge profiling. -// Edge profiling can give a reasonable approximation of the hot paths through a -// program, and is used for a wide variety of program transformations. -// -// Note that this implementation is very naive. We insert a counter for *every* -// edge in the program, instead of using control flow information to prune the -// number of counters inserted. -// -//===----------------------------------------------------------------------===// -#define DEBUG_TYPE "insert-edge-profiling" - -#include "llvm/Transforms/Instrumentation.h" -#include "ProfilingUtils.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include <set> -using namespace llvm; - -STATISTIC(NumEdgesInserted, "The # of edges inserted."); - -namespace { - class EdgeProfiler : public ModulePass { - bool runOnModule(Module &M); - public: - static char ID; // Pass identification, replacement for typeid - EdgeProfiler() : ModulePass(ID) { - initializeEdgeProfilerPass(*PassRegistry::getPassRegistry()); - } - - virtual const char *getPassName() const { - return "Edge Profiler"; - } - }; -} - -char EdgeProfiler::ID = 0; -INITIALIZE_PASS(EdgeProfiler, "insert-edge-profiling", - "Insert instrumentation for edge profiling", false, false) - -ModulePass *llvm::createEdgeProfilerPass() { return new EdgeProfiler(); } - -bool EdgeProfiler::runOnModule(Module &M) { - Function *Main = M.getFunction("main"); - if (Main == 0) { - errs() << "WARNING: cannot insert edge profiling into a module" - << " with no main function!\n"; - return false; // No main, no instrumentation! - } - - std::set<BasicBlock*> BlocksToInstrument; - unsigned NumEdges = 0; - for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { - if (F->isDeclaration()) continue; - // Reserve space for (0,entry) edge. 
- ++NumEdges; - for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { - // Keep track of which blocks need to be instrumented. We don't want to - // instrument blocks that are added as the result of breaking critical - // edges! - BlocksToInstrument.insert(BB); - NumEdges += BB->getTerminator()->getNumSuccessors(); - } - } - - Type *ATy = ArrayType::get(Type::getInt32Ty(M.getContext()), NumEdges); - GlobalVariable *Counters = - new GlobalVariable(M, ATy, false, GlobalValue::InternalLinkage, - Constant::getNullValue(ATy), "EdgeProfCounters"); - NumEdgesInserted = NumEdges; - - // Instrument all of the edges... - unsigned i = 0; - for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { - if (F->isDeclaration()) continue; - // Create counter for (0,entry) edge. - IncrementCounterInBlock(&F->getEntryBlock(), i++, Counters); - for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) - if (BlocksToInstrument.count(BB)) { // Don't instrument inserted blocks - // Okay, we have to add a counter of each outgoing edge. If the - // outgoing edge is not critical don't split it, just insert the counter - // in the source or destination of the edge. - TerminatorInst *TI = BB->getTerminator(); - for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) { - // If the edge is critical, split it. - SplitCriticalEdge(TI, s, this); - - // Okay, we are guaranteed that the edge is no longer critical. If we - // only have a single successor, insert the counter in this block, - // otherwise insert it in the successor block. - if (TI->getNumSuccessors() == 1) { - // Insert counter at the start of the block - IncrementCounterInBlock(BB, i++, Counters, false); - } else { - // Insert counter at the start of the block - IncrementCounterInBlock(TI->getSuccessor(s), i++, Counters); - } - } - } - } - - // Add the initialization call to main. 
- InsertProfilingInitCall(Main, "llvm_start_edge_profiling", Counters); - return true; -} - diff --git a/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 2edd151..206bffb 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -17,7 +17,6 @@ #define DEBUG_TYPE "insert-gcov-profiling" #include "llvm/Transforms/Instrumentation.h" -#include "ProfilingUtils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" @@ -34,9 +33,10 @@ #include "llvm/Support/DebugLoc.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/InstIterator.h" -#include "llvm/Support/PathV2.h" +#include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include <algorithm> #include <string> #include <utility> using namespace llvm; @@ -102,6 +102,7 @@ namespace { Constant *getIncrementIndirectCounterFunc(); Constant *getEmitFunctionFunc(); Constant *getEmitArcsFunc(); + Constant *getSummaryInfoFunc(); Constant *getDeleteWriteoutFunctionListFunc(); Constant *getDeleteFlushFunctionListFunc(); Constant *getEndFileFunc(); @@ -153,10 +154,10 @@ static std::string getFunctionName(DISubprogram SP) { namespace { class GCOVRecord { protected: - static const char *LinesTag; - static const char *FunctionTag; - static const char *BlockTag; - static const char *EdgeTag; + static const char *const LinesTag; + static const char *const FunctionTag; + static const char *const BlockTag; + static const char *const EdgeTag; GCOVRecord() {} @@ -170,7 +171,7 @@ namespace { // Returns the length measured in 4-byte blocks that will be used to // represent this string in a GCOV file - unsigned lengthOfGCOVString(StringRef s) { + static unsigned lengthOfGCOVString(StringRef s) { // A GCOV string is a length, followed by a NUL, then between 0 and 3 NULs // padding out to the next 4-byte word. The length is measured in 4-byte // words including padding, not bytes of actual string. @@ -190,10 +191,10 @@ namespace { raw_ostream *os; }; - const char *GCOVRecord::LinesTag = "\0\0\x45\x01"; - const char *GCOVRecord::FunctionTag = "\0\0\0\1"; - const char *GCOVRecord::BlockTag = "\0\0\x41\x01"; - const char *GCOVRecord::EdgeTag = "\0\0\x43\x01"; + const char *const GCOVRecord::LinesTag = "\0\0\x45\x01"; + const char *const GCOVRecord::FunctionTag = "\0\0\0\1"; + const char *const GCOVRecord::BlockTag = "\0\0\x41\x01"; + const char *const GCOVRecord::EdgeTag = "\0\0\x43\x01"; class GCOVFunction; class GCOVBlock; @@ -207,7 +208,7 @@ namespace { Lines.push_back(Line); } - uint32_t length() { + uint32_t length() const { // Here 2 = 1 for string length + 1 for '0' id#. return lengthOfGCOVString(Filename) + 2 + Lines.size(); } @@ -229,6 +230,15 @@ namespace { SmallVector<uint32_t, 32> Lines; }; + + // Sorting function for deterministic behaviour in GCOVBlock::writeOut. + struct StringKeySort { + bool operator()(StringMapEntry<GCOVLines *> *LHS, + StringMapEntry<GCOVLines *> *RHS) const { + return LHS->getKey() < RHS->getKey(); + } + }; + // Represent a basic block in GCOV. Each block has a unique number in the // function, number of lines belonging to each block, and a set of edges to // other blocks. 
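The length rule that lengthOfGCOVString's comment describes is easy to sanity-check. A minimal standalone sketch (a reconstruction from that comment, since the function body lies outside the hunks shown here; gcovStringWords is a stand-in name):

    #include <string>

    // A GCOV string occupies (size / 4) + 1 four-byte words: the extra word
    // always leaves room for the terminating NUL plus 0-3 bytes of padding.
    static unsigned gcovStringWords(const std::string &s) {
      return static_cast<unsigned>(s.size() / 4) + 1;
    }

    // gcovStringWords("abc")  == 1   ("abc" plus NUL fills one word exactly)
    // gcovStringWords("abcd") == 2   ("abcd" plus NUL, then three NULs of padding)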
@@ -248,17 +258,23 @@ namespace { void writeOut() { uint32_t Len = 3; + SmallVector<StringMapEntry<GCOVLines *> *, 32> SortedLinesByFile; for (StringMap<GCOVLines *>::iterator I = LinesByFile.begin(), E = LinesByFile.end(); I != E; ++I) { Len += I->second->length(); + SortedLinesByFile.push_back(&*I); } writeBytes(LinesTag, 4); write(Len); write(Number); - for (StringMap<GCOVLines *>::iterator I = LinesByFile.begin(), - E = LinesByFile.end(); I != E; ++I) - I->second->writeOut(); + + StringKeySort Sorter; + std::sort(SortedLinesByFile.begin(), SortedLinesByFile.end(), Sorter); + for (SmallVectorImpl<StringMapEntry<GCOVLines *> *>::iterator + I = SortedLinesByFile.begin(), E = SortedLinesByFile.end(); + I != E; ++I) + (*I)->getValue()->writeOut(); write(0); write(0); } @@ -335,9 +351,10 @@ namespace { DEBUG(dbgs() << Blocks.size() << " blocks.\n"); // Emit edges between blocks. - for (DenseMap<BasicBlock *, GCOVBlock *>::iterator I = Blocks.begin(), - E = Blocks.end(); I != E; ++I) { - GCOVBlock &Block = *I->second; + if (Blocks.empty()) return; + Function *F = Blocks.begin()->first->getParent(); + for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { + GCOVBlock &Block = *Blocks[I]; if (Block.OutEdges.empty()) continue; writeBytes(EdgeTag, 4); @@ -352,9 +369,8 @@ namespace { } // Emit lines for each block. - for (DenseMap<BasicBlock *, GCOVBlock *>::iterator I = Blocks.begin(), - E = Blocks.end(); I != E; ++I) { - I->second->writeOut(); + for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { + Blocks[I]->writeOut(); } } @@ -410,7 +426,7 @@ void GCOVProfiler::emitProfileNotes() { DICompileUnit CU(CU_Nodes->getOperand(i)); std::string ErrorInfo; raw_fd_ostream out(mangleName(CU, "gcno").c_str(), ErrorInfo, - raw_fd_ostream::F_Binary); + sys::fs::F_Binary); out.write("oncg", 4); out.write(ReversedVersion, 4); out.write("MVLL", 4); @@ -418,7 +434,10 @@ void GCOVProfiler::emitProfileNotes() { DIArray SPs = CU.getSubprograms(); for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) { DISubprogram SP(SPs.getElement(i)); - if (!SP.Verify()) continue; + assert((!SP || SP.isSubprogram()) && + "A MDNode in subprograms of a CU should be null or a DISubprogram."); + if (!SP) + continue; Function *F = SP.getFunction(); if (!F) continue; @@ -467,7 +486,10 @@ bool GCOVProfiler::emitProfileArcs() { SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> CountersBySP; for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) { DISubprogram SP(SPs.getElement(i)); - if (!SP.Verify()) continue; + assert((!SP || SP.isSubprogram()) && + "A MDNode in subprograms of a CU should be null or a DISubprogram."); + if (!SP) + continue; Function *F = SP.getFunction(); if (!F) continue; if (!Result) Result = true; @@ -497,15 +519,15 @@ bool GCOVProfiler::emitProfileArcs() { TerminatorInst *TI = BB->getTerminator(); int Successors = isa<ReturnInst>(TI) ? 
                                          1 : TI->getNumSuccessors();
         if (Successors) {
-          IRBuilder<> Builder(TI);
-
           if (Successors == 1) {
+            IRBuilder<> Builder(BB->getFirstInsertionPt());
             Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0,
                                                                 Edge);
             Value *Count = Builder.CreateLoad(Counter);
             Count = Builder.CreateAdd(Count, Builder.getInt64(1));
             Builder.CreateStore(Count, Counter);
           } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+            IRBuilder<> Builder(BI);
             Value *Sel = Builder.CreateSelect(BI->getCondition(),
                                               Builder.getInt64(Edge),
                                               Builder.getInt64(Edge + 1));
@@ -521,6 +543,7 @@ bool GCOVProfiler::emitProfileArcs() {
             for (int i = 0; i != Successors; ++i)
               ComplexEdgeSuccs.insert(TI->getSuccessor(i));
           }
+          Edge += Successors;
         }
       }
@@ -532,14 +555,13 @@ bool GCOVProfiler::emitProfileArcs() {
       GlobalVariable *EdgeState = getEdgeStateValue();
 
       for (int i = 0, e = ComplexEdgePreds.size(); i != e; ++i) {
-        IRBuilder<> Builder(ComplexEdgePreds[i+1]->getTerminator());
+        IRBuilder<> Builder(ComplexEdgePreds[i + 1]->getFirstInsertionPt());
         Builder.CreateStore(Builder.getInt32(i), EdgeState);
       }
+
       for (int i = 0, e = ComplexEdgeSuccs.size(); i != e; ++i) {
-        // call runtime to perform increment
-        BasicBlock::iterator InsertPt =
-          ComplexEdgeSuccs[i+1]->getFirstInsertionPt();
-        IRBuilder<> Builder(InsertPt);
+        // Call runtime to perform increment.
+        IRBuilder<> Builder(ComplexEdgeSuccs[i+1]->getFirstInsertionPt());
         Value *CounterPtrArray =
           Builder.CreateConstInBoundsGEP2_64(EdgeTable, 0,
                                              i * ComplexEdgePreds.size());
@@ -577,7 +599,7 @@ bool GCOVProfiler::emitProfileArcs() {
       };
       FTy = FunctionType::get(Builder.getVoidTy(), Params, false);
 
-      // Inialize the environment and register the local writeout and flush
+      // Initialize the environment and register the local writeout and flush
       // functions.
       Constant *GCOVInit = M->getOrInsertFunction("llvm_gcov_init", FTy);
       Builder.CreateCall2(GCOVInit, WriteoutF, FlushF);
@@ -679,6 +701,11 @@ Constant *GCOVProfiler::getEmitArcsFunc() {
   return M->getOrInsertFunction("llvm_gcda_emit_arcs", FTy);
 }
 
+Constant *GCOVProfiler::getSummaryInfoFunc() {
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+  return M->getOrInsertFunction("llvm_gcda_summary_info", FTy);
+}
+
 Constant *GCOVProfiler::getDeleteWriteoutFunctionListFunc() {
   FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
   return M->getOrInsertFunction("llvm_delete_writeout_function_list", FTy);
@@ -725,6 +752,7 @@ Function *GCOVProfiler::insertCounterWriteout(
   Constant *StartFile = getStartFileFunc();
   Constant *EmitFunction = getEmitFunctionFunc();
   Constant *EmitArcs = getEmitArcsFunc();
+  Constant *SummaryInfo = getSummaryInfoFunc();
   Constant *EndFile = getEndFileFunc();
 
   NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
@@ -751,6 +779,7 @@ Function *GCOVProfiler::insertCounterWriteout(
                            Builder.getInt32(Arcs),
                            Builder.CreateConstGEP2_64(GV, 0, 0));
       }
+      Builder.CreateCall(SummaryInfo);
       Builder.CreateCall(EndFile);
     }
   }
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
index 9f35396..b1bea38 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -24,12 +24,10 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) {
   initializeAddressSanitizerPass(Registry);
   initializeAddressSanitizerModulePass(Registry);
   initializeBoundsCheckingPass(Registry);
-  initializeEdgeProfilerPass(Registry);
   initializeGCOVProfilerPass(Registry);
-  initializeOptimalEdgeProfilerPass(Registry);
-  initializePathProfilerPass(Registry);
   initializeMemorySanitizerPass(Registry);
   initializeThreadSanitizerPass(Registry);
+  initializeDataFlowSanitizerPass(Registry);
 }
 
 /// LLVMInitializeInstrumentation - C binding for
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 4e75904..d547adc 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -66,6 +66,31 @@
 /// avoids storing origin to memory when a fully initialized value is stored.
 /// This way it avoids needlessly overwriting the origin of the 4-byte region
 /// on a short (i.e. 1 byte) clean store, and it is also good for performance.
+///
+/// Atomic handling.
+///
+/// Ideally, every atomic store of an application value should update the
+/// corresponding shadow location in an atomic way. Unfortunately, an atomic
+/// store to two disjoint locations cannot be done without severe slowdown.
+///
+/// Therefore, we implement an approximation that may err on the safe side.
+/// In this implementation, every atomically accessed location in the program
+/// may only change from (partially) uninitialized to fully initialized, but
+/// not the other way around. We load the shadow _after_ the application load,
+/// and we store the shadow _before_ the app store. Also, we always store clean
+/// shadow (if the application store is atomic). This way, if the store-load
+/// pair constitutes a happens-before arc, shadow store and load are correctly
+/// ordered such that the load will get either the value that was stored, or
+/// some later value (which is always clean).
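+///
+/// For illustration, consider a release store that is later observed by an
+/// acquire load of the same location g:
+///
+///   writer:  store shadow(g) = clean    (shadow store comes first)
+///            store atomic g, release
+///   reader:  load atomic g, acquire
+///            load shadow(g)             (shadow load comes last)
+///
+/// If the load observes the store, the same happens-before edge orders the
+/// shadow accesses, so the reader sees either that clean shadow or a later
+/// (also clean) value.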
+/// +/// This does not work very well with Compare-And-Swap (CAS) and +/// Read-Modify-Write (RMW) operations. To follow the above logic, CAS and RMW +/// must store the new shadow before the app operation, and load the shadow +/// after the app operation. Computers don't work this way. Current +/// implementation ignores the load aspect of CAS/RMW, always returning a clean +/// value. It implements the store part as a simple atomic store by storing a +/// clean shadow. + //===----------------------------------------------------------------------===// #define DEBUG_TYPE "msan" @@ -74,6 +99,7 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Triple.h" #include "llvm/ADT/ValueMap.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" @@ -90,9 +116,9 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/BlackList.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Transforms/Utils/SpecialCaseList.h" using namespace llvm; @@ -156,6 +182,18 @@ static cl::opt<std::string> ClBlacklistFile("msan-blacklist", cl::desc("File containing the list of functions where MemorySanitizer " "should not report bugs"), cl::Hidden); +// Experimental. Wraps all indirect calls in the instrumented code with +// a call to the given function. This is needed to assist the dynamic +// helper tool (MSanDR) to regain control on transition between instrumented and +// non-instrumented code. +static cl::opt<std::string> ClWrapIndirectCalls("msan-wrap-indirect-calls", + cl::desc("Wrap indirect calls with a given function"), + cl::Hidden); + +static cl::opt<bool> ClWrapIndirectCallsFast("msan-wrap-indirect-calls-fast", + cl::desc("Do not wrap indirect calls with target in the same module"), + cl::Hidden, cl::init(true)); + namespace { /// \brief An instrumentation pass implementing detection of uninitialized @@ -167,12 +205,12 @@ class MemorySanitizer : public FunctionPass { public: MemorySanitizer(bool TrackOrigins = false, StringRef BlacklistFile = StringRef()) - : FunctionPass(ID), - TrackOrigins(TrackOrigins || ClTrackOrigins), - TD(0), - WarningFn(0), - BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile - : BlacklistFile) { } + : FunctionPass(ID), + TrackOrigins(TrackOrigins || ClTrackOrigins), + TD(0), + WarningFn(0), + BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile : BlacklistFile), + WrapIndirectCalls(!ClWrapIndirectCalls.empty()) {} const char *getPassName() const { return "MemorySanitizer"; } bool runOnFunction(Function &F); bool doInitialization(Module &M); @@ -206,13 +244,16 @@ class MemorySanitizer : public FunctionPass { /// function. GlobalVariable *OriginTLS; + GlobalVariable *MsandrModuleStart; + GlobalVariable *MsandrModuleEnd; + /// \brief The run-time callback to print a warning. Value *WarningFn; /// \brief Run-time helper that copies origin info for a memory range. Value *MsanCopyOriginFn; /// \brief Run-time helper that generates a new origin value for a stack /// allocation. - Value *MsanSetAllocaOriginFn; + Value *MsanSetAllocaOrigin4Fn; /// \brief Run-time helper that poisons stack on function entry. Value *MsanPoisonStackFn; /// \brief MSan runtime replacements for memmove, memcpy and memset. @@ -228,13 +269,19 @@ class MemorySanitizer : public FunctionPass { MDNode *ColdCallWeights; /// \brief Branch weights for origin store. 
MDNode *OriginStoreWeights; - /// \bried Path to blacklist file. + /// \brief Path to blacklist file. SmallString<64> BlacklistFile; /// \brief The blacklist. - OwningPtr<BlackList> BL; + OwningPtr<SpecialCaseList> BL; /// \brief An empty volatile inline asm that prevents callback merge. InlineAsm *EmptyAsm; + bool WrapIndirectCalls; + /// \brief Run-time wrapper for indirect calls. + Value *IndirectCallWrapperFn; + // Argument and return type of IndirectCallWrapperFn: void (*f)(void). + Type *AnyFunctionPtrTy; + friend struct MemorySanitizerVisitor; friend struct VarArgAMD64Helper; }; @@ -280,9 +327,9 @@ void MemorySanitizer::initializeCallbacks(Module &M) { MsanCopyOriginFn = M.getOrInsertFunction( "__msan_copy_origin", IRB.getVoidTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy, NULL); - MsanSetAllocaOriginFn = M.getOrInsertFunction( - "__msan_set_alloca_origin", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy, - IRB.getInt8PtrTy(), NULL); + MsanSetAllocaOrigin4Fn = M.getOrInsertFunction( + "__msan_set_alloca_origin4", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy, + IRB.getInt8PtrTy(), IntptrTy, NULL); MsanPoisonStackFn = M.getOrInsertFunction( "__msan_poison_stack", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy, NULL); MemmoveFn = M.getOrInsertFunction( @@ -299,35 +346,53 @@ void MemorySanitizer::initializeCallbacks(Module &M) { RetvalTLS = new GlobalVariable( M, ArrayType::get(IRB.getInt64Ty(), 8), false, GlobalVariable::ExternalLinkage, 0, "__msan_retval_tls", 0, - GlobalVariable::GeneralDynamicTLSModel); + GlobalVariable::InitialExecTLSModel); RetvalOriginTLS = new GlobalVariable( M, OriginTy, false, GlobalVariable::ExternalLinkage, 0, - "__msan_retval_origin_tls", 0, GlobalVariable::GeneralDynamicTLSModel); + "__msan_retval_origin_tls", 0, GlobalVariable::InitialExecTLSModel); ParamTLS = new GlobalVariable( M, ArrayType::get(IRB.getInt64Ty(), 1000), false, GlobalVariable::ExternalLinkage, 0, "__msan_param_tls", 0, - GlobalVariable::GeneralDynamicTLSModel); + GlobalVariable::InitialExecTLSModel); ParamOriginTLS = new GlobalVariable( M, ArrayType::get(OriginTy, 1000), false, GlobalVariable::ExternalLinkage, - 0, "__msan_param_origin_tls", 0, GlobalVariable::GeneralDynamicTLSModel); + 0, "__msan_param_origin_tls", 0, GlobalVariable::InitialExecTLSModel); VAArgTLS = new GlobalVariable( M, ArrayType::get(IRB.getInt64Ty(), 1000), false, GlobalVariable::ExternalLinkage, 0, "__msan_va_arg_tls", 0, - GlobalVariable::GeneralDynamicTLSModel); + GlobalVariable::InitialExecTLSModel); VAArgOverflowSizeTLS = new GlobalVariable( M, IRB.getInt64Ty(), false, GlobalVariable::ExternalLinkage, 0, "__msan_va_arg_overflow_size_tls", 0, - GlobalVariable::GeneralDynamicTLSModel); + GlobalVariable::InitialExecTLSModel); OriginTLS = new GlobalVariable( M, IRB.getInt32Ty(), false, GlobalVariable::ExternalLinkage, 0, - "__msan_origin_tls", 0, GlobalVariable::GeneralDynamicTLSModel); + "__msan_origin_tls", 0, GlobalVariable::InitialExecTLSModel); // We insert an empty inline asm after __msan_report* to avoid callback merge. 
EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), StringRef(""), StringRef(""), /*hasSideEffects=*/true); + + if (WrapIndirectCalls) { + AnyFunctionPtrTy = + PointerType::getUnqual(FunctionType::get(IRB.getVoidTy(), false)); + IndirectCallWrapperFn = M.getOrInsertFunction( + ClWrapIndirectCalls, AnyFunctionPtrTy, AnyFunctionPtrTy, NULL); + } + + if (ClWrapIndirectCallsFast) { + MsandrModuleStart = new GlobalVariable( + M, IRB.getInt32Ty(), false, GlobalValue::ExternalLinkage, + 0, "__executable_start"); + MsandrModuleStart->setVisibility(GlobalVariable::HiddenVisibility); + MsandrModuleEnd = new GlobalVariable( + M, IRB.getInt32Ty(), false, GlobalValue::ExternalLinkage, + 0, "_end"); + MsandrModuleEnd->setVisibility(GlobalVariable::HiddenVisibility); + } } /// \brief Module-level initialization. @@ -337,7 +402,7 @@ bool MemorySanitizer::doInitialization(Module &M) { TD = getAnalysisIfAvailable<DataLayout>(); if (!TD) return false; - BL.reset(new BlackList(BlacklistFile)); + BL.reset(SpecialCaseList::createOrDie(BlacklistFile)); C = &(M.getContext()); unsigned PtrSize = TD->getPointerSizeInBits(/* AddressSpace */0); switch (PtrSize) { @@ -365,11 +430,13 @@ bool MemorySanitizer::doInitialization(Module &M) { appendToGlobalCtors(M, cast<Function>(M.getOrInsertFunction( "__msan_init", IRB.getVoidTy(), NULL)), 0); - new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage, - IRB.getInt32(TrackOrigins), "__msan_track_origins"); + if (TrackOrigins) + new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage, + IRB.getInt32(TrackOrigins), "__msan_track_origins"); - new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage, - IRB.getInt32(ClKeepGoing), "__msan_keep_going"); + if (ClKeepGoing) + new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage, + IRB.getInt32(ClKeepGoing), "__msan_keep_going"); return true; } @@ -420,27 +487,40 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { MemorySanitizer &MS; SmallVector<PHINode *, 16> ShadowPHINodes, OriginPHINodes; ValueMap<Value*, Value*> ShadowMap, OriginMap; + OwningPtr<VarArgHelper> VAHelper; + + // The following flags disable parts of MSan instrumentation based on + // blacklist contents and command-line options. 
bool InsertChecks; bool LoadShadow; - OwningPtr<VarArgHelper> VAHelper; + bool PoisonStack; + bool PoisonUndef; + bool CheckReturnValue; struct ShadowOriginAndInsertPoint { - Instruction *Shadow; - Instruction *Origin; + Value *Shadow; + Value *Origin; Instruction *OrigIns; - ShadowOriginAndInsertPoint(Instruction *S, Instruction *O, Instruction *I) + ShadowOriginAndInsertPoint(Value *S, Value *O, Instruction *I) : Shadow(S), Origin(O), OrigIns(I) { } ShadowOriginAndInsertPoint() : Shadow(0), Origin(0), OrigIns(0) { } }; SmallVector<ShadowOriginAndInsertPoint, 16> InstrumentationList; SmallVector<Instruction*, 16> StoreList; + SmallVector<CallSite, 16> IndirectCallList; MemorySanitizerVisitor(Function &F, MemorySanitizer &MS) : F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)) { - LoadShadow = InsertChecks = - !MS.BL->isIn(F) && - F.getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::SanitizeMemory); + bool SanitizeFunction = !MS.BL->isIn(F) && F.getAttributes().hasAttribute( + AttributeSet::FunctionIndex, + Attribute::SanitizeMemory); + InsertChecks = SanitizeFunction; + LoadShadow = SanitizeFunction; + PoisonStack = SanitizeFunction && ClPoisonStack; + PoisonUndef = SanitizeFunction && ClPoisonUndef; + // FIXME: Consider using SpecialCaseList to specify a list of functions that + // must always return fully initialized values. For now, we hardcode "main". + CheckReturnValue = SanitizeFunction && (F.getName() == "main"); DEBUG(if (!InsertChecks) dbgs() << "MemorySanitizer is not inserting checks into '" @@ -454,7 +534,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRB(&I); Value *Val = I.getValueOperand(); Value *Addr = I.getPointerOperand(); - Value *Shadow = getShadow(Val); + Value *Shadow = I.isAtomic() ? getCleanShadow(Val) : getShadow(Val); Value *ShadowPtr = getShadowPtr(Addr, Shadow->getType(), IRB); StoreInst *NewSI = @@ -463,7 +543,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { (void)NewSI; if (ClCheckAccessAddress) - insertCheck(Addr, &I); + insertShadowCheck(Addr, &I); + + if (I.isAtomic()) + I.setOrdering(addReleaseOrdering(I.getOrdering())); if (MS.TrackOrigins) { unsigned Alignment = std::max(kMinOriginAlignment, I.getAlignment()); @@ -473,11 +556,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { } else { Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); - Constant *Cst = dyn_cast_or_null<Constant>(ConvertedShadow); // TODO(eugenis): handle non-zero constant shadow by inserting an // unconditional check (can not simply fail compilation as this could // be in the dead code). - if (Cst) + if (isa<Constant>(ConvertedShadow)) continue; Value *Cmp = IRB.CreateICmpNE(ConvertedShadow, @@ -495,12 +577,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { void materializeChecks() { for (size_t i = 0, n = InstrumentationList.size(); i < n; i++) { - Instruction *Shadow = InstrumentationList[i].Shadow; + Value *Shadow = InstrumentationList[i].Shadow; Instruction *OrigIns = InstrumentationList[i].OrigIns; IRBuilder<> IRB(OrigIns); DEBUG(dbgs() << " SHAD0 : " << *Shadow << "\n"); Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n"); + // See the comment in materializeStores(). 
+ if (isa<Constant>(ConvertedShadow)) + continue; Value *Cmp = IRB.CreateICmpNE(ConvertedShadow, getCleanShadow(ConvertedShadow), "_mscmp"); Instruction *CheckTerm = @@ -510,7 +595,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRB.SetInsertPoint(CheckTerm); if (MS.TrackOrigins) { - Instruction *Origin = InstrumentationList[i].Origin; + Value *Origin = InstrumentationList[i].Origin; IRB.CreateStore(Origin ? (Value*)Origin : (Value*)IRB.getInt32(0), MS.OriginTLS); } @@ -522,6 +607,48 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { DEBUG(dbgs() << "DONE:\n" << F); } + void materializeIndirectCalls() { + for (size_t i = 0, n = IndirectCallList.size(); i < n; i++) { + CallSite CS = IndirectCallList[i]; + Instruction *I = CS.getInstruction(); + BasicBlock *B = I->getParent(); + IRBuilder<> IRB(I); + Value *Fn0 = CS.getCalledValue(); + Value *Fn = IRB.CreateBitCast(Fn0, MS.AnyFunctionPtrTy); + + if (ClWrapIndirectCallsFast) { + // Check that call target is inside this module limits. + Value *Start = + IRB.CreateBitCast(MS.MsandrModuleStart, MS.AnyFunctionPtrTy); + Value *End = IRB.CreateBitCast(MS.MsandrModuleEnd, MS.AnyFunctionPtrTy); + + Value *NotInThisModule = IRB.CreateOr(IRB.CreateICmpULT(Fn, Start), + IRB.CreateICmpUGE(Fn, End)); + + PHINode *NewFnPhi = + IRB.CreatePHI(Fn0->getType(), 2, "msandr.indirect_target"); + + Instruction *CheckTerm = SplitBlockAndInsertIfThen( + cast<Instruction>(NotInThisModule), + /* Unreachable */ false, MS.ColdCallWeights); + + IRB.SetInsertPoint(CheckTerm); + // Slow path: call wrapper function to possibly transform the call + // target. + Value *NewFn = IRB.CreateBitCast( + IRB.CreateCall(MS.IndirectCallWrapperFn, Fn), Fn0->getType()); + + NewFnPhi->addIncoming(Fn0, B); + NewFnPhi->addIncoming(NewFn, dyn_cast<Instruction>(NewFn)->getParent()); + CS.setCalledFunction(NewFnPhi); + } else { + Value *NewFn = IRB.CreateBitCast( + IRB.CreateCall(MS.IndirectCallWrapperFn, Fn), Fn0->getType()); + CS.setCalledFunction(NewFn); + } + } + } + /// \brief Add MemorySanitizer instrumentation to a function. bool runOnFunction() { MS.initializeCallbacks(*F.getParent()); @@ -564,6 +691,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // Insert shadow value checks. materializeChecks(); + // Wrap indirect calls. + materializeIndirectCalls(); + return true; } @@ -741,7 +871,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { return Shadow; } if (UndefValue *U = dyn_cast<UndefValue>(V)) { - Value *AllOnes = ClPoisonUndef ? getPoisonedShadow(V) : getCleanShadow(V); + Value *AllOnes = PoisonUndef ? getPoisonedShadow(V) : getCleanShadow(V); DEBUG(dbgs() << "Undef: " << *U << " ==> " << *AllOnes << "\n"); (void)U; return AllOnes; @@ -768,14 +898,21 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (AI->hasByValAttr()) { // ByVal pointer itself has clean shadow. We copy the actual // argument shadow to the underlying memory. + // Figure out maximal valid memcpy alignment. 
+ unsigned ArgAlign = AI->getParamAlignment(); + if (ArgAlign == 0) { + Type *EltType = A->getType()->getPointerElementType(); + ArgAlign = MS.TD->getABITypeAlignment(EltType); + } + unsigned CopyAlign = std::min(ArgAlign, kShadowTLSAlignment); Value *Cpy = EntryIRB.CreateMemCpy( - getShadowPtr(V, EntryIRB.getInt8Ty(), EntryIRB), - Base, Size, AI->getParamAlignment()); + getShadowPtr(V, EntryIRB.getInt8Ty(), EntryIRB), Base, Size, + CopyAlign); DEBUG(dbgs() << " ByValCpy: " << *Cpy << "\n"); (void)Cpy; *ShadowPtr = getCleanShadow(V); } else { - *ShadowPtr = EntryIRB.CreateLoad(Base); + *ShadowPtr = EntryIRB.CreateAlignedLoad(Base, kShadowTLSAlignment); } DEBUG(dbgs() << " ARG: " << *AI << " ==> " << **ShadowPtr << "\n"); @@ -784,7 +921,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOrigin(A, EntryIRB.CreateLoad(OriginPtr)); } } - ArgOffset += DataLayout::RoundUpAlignment(Size, 8); + ArgOffset += DataLayout::RoundUpAlignment(Size, kShadowTLSAlignment); } assert(*ShadowPtr && "Could not find shadow for an argument"); return *ShadowPtr; @@ -820,20 +957,63 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// \brief Remember the place where a shadow check should be inserted. /// /// This location will be later instrumented with a check that will print a - /// UMR warning in runtime if the value is not fully defined. - void insertCheck(Value *Val, Instruction *OrigIns) { - assert(Val); + /// UMR warning in runtime if the shadow value is not 0. + void insertShadowCheck(Value *Shadow, Value *Origin, Instruction *OrigIns) { + assert(Shadow); if (!InsertChecks) return; - Instruction *Shadow = dyn_cast_or_null<Instruction>(getShadow(Val)); - if (!Shadow) return; #ifndef NDEBUG Type *ShadowTy = Shadow->getType(); assert((isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy)) && "Can only insert checks for integer and vector shadow types"); #endif - Instruction *Origin = dyn_cast_or_null<Instruction>(getOrigin(Val)); InstrumentationList.push_back( - ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns)); + ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns)); + } + + /// \brief Remember the place where a shadow check should be inserted. + /// + /// This location will be later instrumented with a check that will print a + /// UMR warning in runtime if the value is not fully defined. + void insertShadowCheck(Value *Val, Instruction *OrigIns) { + assert(Val); + Instruction *Shadow = dyn_cast_or_null<Instruction>(getShadow(Val)); + if (!Shadow) return; + Instruction *Origin = dyn_cast_or_null<Instruction>(getOrigin(Val)); + insertShadowCheck(Shadow, Origin, OrigIns); + } + + AtomicOrdering addReleaseOrdering(AtomicOrdering a) { + switch (a) { + case NotAtomic: + return NotAtomic; + case Unordered: + case Monotonic: + case Release: + return Release; + case Acquire: + case AcquireRelease: + return AcquireRelease; + case SequentiallyConsistent: + return SequentiallyConsistent; + } + llvm_unreachable("Unknown ordering"); + } + + AtomicOrdering addAcquireOrdering(AtomicOrdering a) { + switch (a) { + case NotAtomic: + return NotAtomic; + case Unordered: + case Monotonic: + case Acquire: + return Acquire; + case Release: + case AcquireRelease: + return AcquireRelease; + case SequentiallyConsistent: + return SequentiallyConsistent; + } + llvm_unreachable("Unknown ordering"); } // ------------------- Visitors. 
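Taken together, the two helpers above strengthen orderings as follows (a summary derived from their switch statements; the visitors below apply addAcquireOrdering to atomic loads and addReleaseOrdering to atomic stores, RMW and CAS):

    //   Input                     addAcquireOrdering    addReleaseOrdering
    //   NotAtomic              -> NotAtomic             NotAtomic
    //   Unordered              -> Acquire               Release
    //   Monotonic              -> Acquire               Release
    //   Acquire                -> Acquire               AcquireRelease
    //   Release                -> AcquireRelease        Release
    //   AcquireRelease         -> AcquireRelease        AcquireRelease
    //   SequentiallyConsistent -> unchanged in both cases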
@@ -844,7 +1024,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// Optionally, checks that the load address is fully defined. void visitLoadInst(LoadInst &I) { assert(I.getType()->isSized() && "Load type must have size"); - IRBuilder<> IRB(&I); + IRBuilder<> IRB(I.getNextNode()); Type *ShadowTy = getShadowTy(&I); Value *Addr = I.getPointerOperand(); if (LoadShadow) { @@ -856,7 +1036,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { } if (ClCheckAccessAddress) - insertCheck(I.getPointerOperand(), &I); + insertShadowCheck(I.getPointerOperand(), &I); + + if (I.isAtomic()) + I.setOrdering(addAcquireOrdering(I.getOrdering())); if (MS.TrackOrigins) { if (LoadShadow) { @@ -877,9 +1060,40 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { StoreList.push_back(&I); } + void handleCASOrRMW(Instruction &I) { + assert(isa<AtomicRMWInst>(I) || isa<AtomicCmpXchgInst>(I)); + + IRBuilder<> IRB(&I); + Value *Addr = I.getOperand(0); + Value *ShadowPtr = getShadowPtr(Addr, I.getType(), IRB); + + if (ClCheckAccessAddress) + insertShadowCheck(Addr, &I); + + // Only test the conditional argument of cmpxchg instruction. + // The other argument can potentially be uninitialized, but we can not + // detect this situation reliably without possible false positives. + if (isa<AtomicCmpXchgInst>(I)) + insertShadowCheck(I.getOperand(1), &I); + + IRB.CreateStore(getCleanShadow(&I), ShadowPtr); + + setShadow(&I, getCleanShadow(&I)); + } + + void visitAtomicRMWInst(AtomicRMWInst &I) { + handleCASOrRMW(I); + I.setOrdering(addReleaseOrdering(I.getOrdering())); + } + + void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) { + handleCASOrRMW(I); + I.setOrdering(addReleaseOrdering(I.getOrdering())); + } + // Vector manipulation. void visitExtractElementInst(ExtractElementInst &I) { - insertCheck(I.getOperand(1), &I); + insertShadowCheck(I.getOperand(1), &I); IRBuilder<> IRB(&I); setShadow(&I, IRB.CreateExtractElement(getShadow(&I, 0), I.getOperand(1), "_msprop")); @@ -887,7 +1101,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { } void visitInsertElementInst(InsertElementInst &I) { - insertCheck(I.getOperand(2), &I); + insertShadowCheck(I.getOperand(2), &I); IRBuilder<> IRB(&I); setShadow(&I, IRB.CreateInsertElement(getShadow(&I, 0), getShadow(&I, 1), I.getOperand(2), "_msprop")); @@ -895,7 +1109,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { } void visitShuffleVectorInst(ShuffleVectorInst &I) { - insertCheck(I.getOperand(2), &I); + insertShadowCheck(I.getOperand(2), &I); IRBuilder<> IRB(&I); setShadow(&I, IRB.CreateShuffleVector(getShadow(&I, 0), getShadow(&I, 1), I.getOperand(2), "_msprop")); @@ -1094,18 +1308,19 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// \brief Cast between two shadow types, extending or truncating as /// necessary. 
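 /// For example (illustrative): casting an i1 shadow to i32 with Signed=true
 /// sign-extends, so a clean bit becomes 0 and a poisoned bit becomes
 /// 0xFFFFFFFF; the select instrumentation below relies on exactly this to
 /// OR a condition's shadow into the result shadow.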
-  Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy) {
+  Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy,
+                          bool Signed = false) {
     Type *srcTy = V->getType();
     if (dstTy->isIntegerTy() && srcTy->isIntegerTy())
-      return IRB.CreateIntCast(V, dstTy, false);
+      return IRB.CreateIntCast(V, dstTy, Signed);
     if (dstTy->isVectorTy() && srcTy->isVectorTy() &&
         dstTy->getVectorNumElements() == srcTy->getVectorNumElements())
-      return IRB.CreateIntCast(V, dstTy, false);
+      return IRB.CreateIntCast(V, dstTy, Signed);
     size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
     size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
     Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits));
     Value *V2 =
-        IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), false);
+        IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), Signed);
     return IRB.CreateBitCast(V2, dstTy);
     // TODO: handle struct types.
   }
 
@@ -1130,7 +1345,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   void handleDiv(Instruction &I) {
     IRBuilder<> IRB(&I);
     // Strict on the second argument.
-    insertCheck(I.getOperand(1), &I);
+    insertShadowCheck(I.getOperand(1), &I);
     setShadow(&I, getShadow(&I, 0));
     setOrigin(&I, getOrigin(&I, 0));
   }
@@ -1413,7 +1628,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     IRB.CreateAlignedStore(Shadow, ShadowPtr, 1);
 
     if (ClCheckAccessAddress)
-      insertCheck(Addr, &I);
+      insertShadowCheck(Addr, &I);
 
     // FIXME: use ClStoreCleanOrigin
     // FIXME: factor out common code from materializeStores
@@ -1440,9 +1655,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       setShadow(&I, getCleanShadow(&I));
     }
 
-    if (ClCheckAccessAddress)
-      insertCheck(Addr, &I);
+    insertShadowCheck(Addr, &I);
 
     if (MS.TrackOrigins) {
       if (LoadShadow)
@@ -1539,11 +1753,119 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOrigin(&I, getOrigin(Op));
   }
 
+  // \brief Instrument vector convert intrinsic.
+  //
+  // This function instruments intrinsics like cvtsi2ss:
+  // %Out = int_xxx_cvtyyy(%ConvertOp)
+  // or
+  // %Out = int_xxx_cvtyyy(%CopyOp, %ConvertOp)
+  // Intrinsic converts \p NumUsedElements elements of \p ConvertOp to the same
+  // number of \p Out elements, and (if it has 2 arguments) copies the rest of
+  // the elements from \p CopyOp.
+  // In most cases the conversion involves a floating-point value, which may
+  // trigger a hardware exception when not fully initialized. For this reason
+  // we require \p ConvertOp[0:NumUsedElements] to be fully initialized and
+  // trap otherwise.
+  // We copy the shadow of \p CopyOp[NumUsedElements:] to \p
+  // Out[NumUsedElements:]. This means that intrinsics without \p CopyOp always
+  // return a fully initialized value.
+  void handleVectorConvertIntrinsic(IntrinsicInst &I, int NumUsedElements) {
+    IRBuilder<> IRB(&I);
+    Value *CopyOp, *ConvertOp;
+
+    switch (I.getNumArgOperands()) {
+    case 2:
+      CopyOp = I.getArgOperand(0);
+      ConvertOp = I.getArgOperand(1);
+      break;
+    case 1:
+      ConvertOp = I.getArgOperand(0);
+      CopyOp = NULL;
+      break;
+    default:
+      llvm_unreachable("Cvt intrinsic with unsupported number of arguments.");
+    }
+
+    // The first *NumUsedElements* elements of ConvertOp are converted to the
+    // same number of output elements. The rest of the output is copied from
+    // CopyOp, or (if not available) filled with zeroes.
+    // Combine shadow for elements of ConvertOp that are used in this operation,
+    // and insert a check.
+ // FIXME: consider propagating shadow of ConvertOp, at least in the case of + // int->any conversion. + Value *ConvertShadow = getShadow(ConvertOp); + Value *AggShadow = 0; + if (ConvertOp->getType()->isVectorTy()) { + AggShadow = IRB.CreateExtractElement( + ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), 0)); + for (int i = 1; i < NumUsedElements; ++i) { + Value *MoreShadow = IRB.CreateExtractElement( + ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), i)); + AggShadow = IRB.CreateOr(AggShadow, MoreShadow); + } + } else { + AggShadow = ConvertShadow; + } + assert(AggShadow->getType()->isIntegerTy()); + insertShadowCheck(AggShadow, getOrigin(ConvertOp), &I); + + // Build result shadow by zero-filling parts of CopyOp shadow that come from + // ConvertOp. + if (CopyOp) { + assert(CopyOp->getType() == I.getType()); + assert(CopyOp->getType()->isVectorTy()); + Value *ResultShadow = getShadow(CopyOp); + Type *EltTy = ResultShadow->getType()->getVectorElementType(); + for (int i = 0; i < NumUsedElements; ++i) { + ResultShadow = IRB.CreateInsertElement( + ResultShadow, ConstantInt::getNullValue(EltTy), + ConstantInt::get(IRB.getInt32Ty(), i)); + } + setShadow(&I, ResultShadow); + setOrigin(&I, getOrigin(CopyOp)); + } else { + setShadow(&I, getCleanShadow(&I)); + } + } + void visitIntrinsicInst(IntrinsicInst &I) { switch (I.getIntrinsicID()) { case llvm::Intrinsic::bswap: handleBswap(I); break; + case llvm::Intrinsic::x86_avx512_cvtsd2usi64: + case llvm::Intrinsic::x86_avx512_cvtsd2usi: + case llvm::Intrinsic::x86_avx512_cvtss2usi64: + case llvm::Intrinsic::x86_avx512_cvtss2usi: + case llvm::Intrinsic::x86_avx512_cvttss2usi64: + case llvm::Intrinsic::x86_avx512_cvttss2usi: + case llvm::Intrinsic::x86_avx512_cvttsd2usi64: + case llvm::Intrinsic::x86_avx512_cvttsd2usi: + case llvm::Intrinsic::x86_avx512_cvtusi2sd: + case llvm::Intrinsic::x86_avx512_cvtusi2ss: + case llvm::Intrinsic::x86_avx512_cvtusi642sd: + case llvm::Intrinsic::x86_avx512_cvtusi642ss: + case llvm::Intrinsic::x86_sse2_cvtsd2si64: + case llvm::Intrinsic::x86_sse2_cvtsd2si: + case llvm::Intrinsic::x86_sse2_cvtsd2ss: + case llvm::Intrinsic::x86_sse2_cvtsi2sd: + case llvm::Intrinsic::x86_sse2_cvtsi642sd: + case llvm::Intrinsic::x86_sse2_cvtss2sd: + case llvm::Intrinsic::x86_sse2_cvttsd2si64: + case llvm::Intrinsic::x86_sse2_cvttsd2si: + case llvm::Intrinsic::x86_sse_cvtsi2ss: + case llvm::Intrinsic::x86_sse_cvtsi642ss: + case llvm::Intrinsic::x86_sse_cvtss2si64: + case llvm::Intrinsic::x86_sse_cvtss2si: + case llvm::Intrinsic::x86_sse_cvttss2si64: + case llvm::Intrinsic::x86_sse_cvttss2si: + handleVectorConvertIntrinsic(I, 1); + break; + case llvm::Intrinsic::x86_sse2_cvtdq2pd: + case llvm::Intrinsic::x86_sse2_cvtps2pd: + case llvm::Intrinsic::x86_sse_cvtps2pi: + case llvm::Intrinsic::x86_sse_cvttps2pi: + handleVectorConvertIntrinsic(I, 2); + break; default: if (!handleUnknownIntrinsic(I)) visitInstruction(I); @@ -1589,6 +1911,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { } } IRBuilder<> IRB(&I); + + if (MS.WrapIndirectCalls && !CS.getCalledFunction()) + IndirectCallList.push_back(CS); + unsigned ArgOffset = 0; DEBUG(dbgs() << " CallSite: " << I << "\n"); for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end(); @@ -1632,7 +1958,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { DEBUG(dbgs() << " done with call args\n"); FunctionType *FT = - cast<FunctionType>(CS.getCalledValue()->getType()-> getContainedType(0)); + 
cast<FunctionType>(CS.getCalledValue()->getType()->getContainedType(0)); if (FT->isVarArg()) { VAHelper->visitCallSite(CS, IRB); } @@ -1671,12 +1997,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { void visitReturnInst(ReturnInst &I) { IRBuilder<> IRB(&I); - if (Value *RetVal = I.getReturnValue()) { - // Set the shadow for the RetVal. + Value *RetVal = I.getReturnValue(); + if (!RetVal) return; + Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB); + if (CheckReturnValue) { + insertShadowCheck(RetVal, &I); + Value *Shadow = getCleanShadow(RetVal); + IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment); + } else { Value *Shadow = getShadow(RetVal); - Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB); - DEBUG(dbgs() << "Return: " << *Shadow << "\n" << *ShadowPtr << "\n"); IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment); + // FIXME: make it conditional if ClStoreCleanOrigin==0 if (MS.TrackOrigins) IRB.CreateStore(getOrigin(RetVal), getOriginPtrForRetval(IRB)); } @@ -1694,20 +2025,19 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { void visitAllocaInst(AllocaInst &I) { setShadow(&I, getCleanShadow(&I)); - if (!ClPoisonStack) return; IRBuilder<> IRB(I.getNextNode()); uint64_t Size = MS.TD->getTypeAllocSize(I.getAllocatedType()); - if (ClPoisonStackWithCall) { + if (PoisonStack && ClPoisonStackWithCall) { IRB.CreateCall2(MS.MsanPoisonStackFn, IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), ConstantInt::get(MS.IntptrTy, Size)); } else { Value *ShadowBase = getShadowPtr(&I, Type::getInt8PtrTy(*MS.C), IRB); - IRB.CreateMemSet(ShadowBase, IRB.getInt8(ClPoisonStackPattern), - Size, I.getAlignment()); + Value *PoisonValue = IRB.getInt8(PoisonStack ? ClPoisonStackPattern : 0); + IRB.CreateMemSet(ShadowBase, PoisonValue, Size, I.getAlignment()); } - if (MS.TrackOrigins) { + if (PoisonStack && MS.TrackOrigins) { setOrigin(&I, getCleanOrigin()); SmallString<2048> StackDescriptionStorage; raw_svector_ostream StackDescription(StackDescriptionStorage); @@ -1720,18 +2050,34 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *Descr = createPrivateNonConstGlobalForString(*F.getParent(), StackDescription.str()); - IRB.CreateCall3(MS.MsanSetAllocaOriginFn, + + IRB.CreateCall4(MS.MsanSetAllocaOrigin4Fn, IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), ConstantInt::get(MS.IntptrTy, Size), - IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy())); + IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy()), + IRB.CreatePointerCast(&F, MS.IntptrTy)); } } void visitSelectInst(SelectInst& I) { IRBuilder<> IRB(&I); - setShadow(&I, IRB.CreateSelect(I.getCondition(), - getShadow(I.getTrueValue()), getShadow(I.getFalseValue()), - "_msprop")); + // a = select b, c, d + Value *S = IRB.CreateSelect(I.getCondition(), getShadow(I.getTrueValue()), + getShadow(I.getFalseValue())); + if (I.getType()->isAggregateType()) { + // To avoid "sign extending" i1 to an arbitrary aggregate type, we just do + // an extra "select". This results in much more compact IR. + // Sa = select Sb, poisoned, (select b, Sc, Sd) + S = IRB.CreateSelect(getShadow(I.getCondition()), + getPoisonedShadow(getShadowTy(I.getType())), S, + "_msprop_select_agg"); + } else { + // Sa = (sext Sb) | (select b, Sc, Sd) + S = IRB.CreateOr(S, CreateShadowCast(IRB, getShadow(I.getCondition()), + S->getType(), true), + "_msprop_select"); + } + setShadow(&I, S); if (MS.TrackOrigins) { // Origins are always i32, so any vector conditions must be flattened. 
// FIXME: consider tracking vector origins for app vectors? @@ -1766,7 +2112,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *ResShadow = IRB.CreateExtractValue(AggShadow, I.getIndices()); DEBUG(dbgs() << " ResShadow: " << *ResShadow << "\n"); setShadow(&I, ResShadow); - setOrigin(&I, getCleanOrigin()); + setOriginForNaryOp(I); } void visitInsertValueInst(InsertValueInst &I) { @@ -1779,7 +2125,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *Res = IRB.CreateInsertValue(AggShadow, InsShadow, I.getIndices()); DEBUG(dbgs() << " Res: " << *Res << "\n"); setShadow(&I, Res); - setOrigin(&I, getCleanOrigin()); + setOriginForNaryOp(I); } void dumpInst(Instruction &I) { @@ -1802,7 +2148,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { dumpInst(I); DEBUG(dbgs() << "DEFAULT: " << I << "\n"); for (size_t i = 0, n = I.getNumOperands(); i < n; i++) - insertCheck(I.getOperand(i), &I); + insertShadowCheck(I.getOperand(i), &I); setShadow(&I, getCleanShadow(&I)); setOrigin(&I, getCleanOrigin()); } @@ -1956,16 +2302,35 @@ struct VarArgAMD64Helper : public VarArgHelper { Value *OverflowArgAreaPtr = IRB.CreateLoad(OverflowArgAreaPtrPtr); Value *OverflowArgAreaShadowPtr = MSV.getShadowPtr(OverflowArgAreaPtr, IRB.getInt8Ty(), IRB); - Value *SrcPtr = - getShadowPtrForVAArgument(VAArgTLSCopy, IRB, AMD64FpEndOffset); + Value *SrcPtr = IRB.CreateConstGEP1_32(VAArgTLSCopy, AMD64FpEndOffset); IRB.CreateMemCpy(OverflowArgAreaShadowPtr, SrcPtr, VAArgOverflowSize, 16); } } }; -VarArgHelper* CreateVarArgHelper(Function &Func, MemorySanitizer &Msan, +/// \brief A no-op implementation of VarArgHelper. +struct VarArgNoOpHelper : public VarArgHelper { + VarArgNoOpHelper(Function &F, MemorySanitizer &MS, + MemorySanitizerVisitor &MSV) {} + + void visitCallSite(CallSite &CS, IRBuilder<> &IRB) {} + + void visitVAStartInst(VAStartInst &I) {} + + void visitVACopyInst(VACopyInst &I) {} + + void finalizeInstrumentation() {} +}; + +VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan, MemorySanitizerVisitor &Visitor) { - return new VarArgAMD64Helper(Func, Msan, Visitor); + // VarArg handling is only implemented on AMD64. False positives are possible + // on other platforms. + llvm::Triple TargetTriple(Func.getParent()->getTargetTriple()); + if (TargetTriple.getArch() == llvm::Triple::x86_64) + return new VarArgAMD64Helper(Func, Msan, Visitor); + else + return new VarArgNoOpHelper(Func, Msan, Visitor); } } // namespace diff --git a/contrib/llvm/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp deleted file mode 100644 index b45aef65..0000000 --- a/contrib/llvm/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp +++ /dev/null @@ -1,225 +0,0 @@ -//===- OptimalEdgeProfiling.cpp - Insert counters for opt. edge profiling -===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass instruments the specified program with counters for edge profiling. -// Edge profiling can give a reasonable approximation of the hot paths through a -// program, and is used for a wide variety of program transformations. 
-// -//===----------------------------------------------------------------------===// -#define DEBUG_TYPE "insert-optimal-edge-profiling" -#include "llvm/Transforms/Instrumentation.h" -#include "MaximumSpanningTree.h" -#include "ProfilingUtils.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Passes.h" -#include "llvm/Analysis/ProfileInfo.h" -#include "llvm/Analysis/ProfileInfoLoader.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -using namespace llvm; - -STATISTIC(NumEdgesInserted, "The # of edges inserted."); - -namespace { - class OptimalEdgeProfiler : public ModulePass { - bool runOnModule(Module &M); - public: - static char ID; // Pass identification, replacement for typeid - OptimalEdgeProfiler() : ModulePass(ID) { - initializeOptimalEdgeProfilerPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequiredID(ProfileEstimatorPassID); - AU.addRequired<ProfileInfo>(); - } - - virtual const char *getPassName() const { - return "Optimal Edge Profiler"; - } - }; -} - -char OptimalEdgeProfiler::ID = 0; -INITIALIZE_PASS_BEGIN(OptimalEdgeProfiler, "insert-optimal-edge-profiling", - "Insert optimal instrumentation for edge profiling", - false, false) -INITIALIZE_PASS_DEPENDENCY(ProfileEstimatorPass) -INITIALIZE_AG_DEPENDENCY(ProfileInfo) -INITIALIZE_PASS_END(OptimalEdgeProfiler, "insert-optimal-edge-profiling", - "Insert optimal instrumentation for edge profiling", - false, false) - -ModulePass *llvm::createOptimalEdgeProfilerPass() { - return new OptimalEdgeProfiler(); -} - -inline static void printEdgeCounter(ProfileInfo::Edge e, - BasicBlock* b, - unsigned i) { - DEBUG(dbgs() << "--Edge Counter for " << (e) << " in " \ - << ((b)?(b)->getName():"0") << " (# " << (i) << ")\n"); -} - -bool OptimalEdgeProfiler::runOnModule(Module &M) { - Function *Main = M.getFunction("main"); - if (Main == 0) { - errs() << "WARNING: cannot insert edge profiling into a module" - << " with no main function!\n"; - return false; // No main, no instrumentation! - } - - // NumEdges counts all the edges that may be instrumented. Later on its - // decided which edges to actually instrument, to achieve optimal profiling. - // For the entry block a virtual edge (0,entry) is reserved, for each block - // with no successors an edge (BB,0) is reserved. These edges are necessary - // to calculate a truly optimal maximum spanning tree and thus an optimal - // instrumentation. - unsigned NumEdges = 0; - - for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { - if (F->isDeclaration()) continue; - // Reserve space for (0,entry) edge. - ++NumEdges; - for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { - // Keep track of which blocks need to be instrumented. We don't want to - // instrument blocks that are added as the result of breaking critical - // edges! - if (BB->getTerminator()->getNumSuccessors() == 0) { - // Reserve space for (BB,0) edge. - ++NumEdges; - } else { - NumEdges += BB->getTerminator()->getNumSuccessors(); - } - } - } - - // In the profiling output a counter for each edge is reserved, but only few - // are used. 
This is done to be able to read back in the profile without
-  // calculating the maximum spanning tree again; instead, each edge counter that
-  // is not used is initialised with -1 to signal that this edge counter has to
-  // be calculated from other edge counters on reading the profile info back
-  // in.
-
-  Type *Int32 = Type::getInt32Ty(M.getContext());
-  ArrayType *ATy = ArrayType::get(Int32, NumEdges);
-  GlobalVariable *Counters =
-    new GlobalVariable(M, ATy, false, GlobalValue::InternalLinkage,
-                       Constant::getNullValue(ATy), "OptEdgeProfCounters");
-  NumEdgesInserted = 0;
-
-  std::vector<Constant*> Initializer(NumEdges);
-  Constant *Zero = ConstantInt::get(Int32, 0);
-  Constant *Uncounted = ConstantInt::get(Int32, ProfileInfoLoader::Uncounted);
-
-  // Instrument all of the edges not in MST...
-  unsigned i = 0;
-  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-    if (F->isDeclaration()) continue;
-    DEBUG(dbgs() << "Working on " << F->getName() << "\n");
-
-    // Calculate a Maximum Spanning Tree with the edge weights determined by
-    // ProfileEstimator. ProfileEstimator also assigns weights to the virtual
-    // edges (0,entry) and (BB,0) (for blocks with no successors) and these
-    // edges also participate in the maximum spanning tree calculation.
-    // The third parameter of MaximumSpanningTree() has the effect that not the
-    // actual MST is returned but the edges _not_ in the MST.
-
-    ProfileInfo::EdgeWeights ECs =
-      getAnalysis<ProfileInfo>(*F).getEdgeWeights(F);
-    std::vector<ProfileInfo::EdgeWeight> EdgeVector(ECs.begin(), ECs.end());
-    MaximumSpanningTree<BasicBlock> MST(EdgeVector);
-    std::stable_sort(MST.begin(), MST.end());
-
-    // Check if (0,entry) not in the MST. If not, instrument edge
-    // (IncrementCounterInBlock()) and set the counter initially to zero; if
-    // the edge is in the MST the counter is initialised to -1.
-
-    BasicBlock *entry = &(F->getEntryBlock());
-    ProfileInfo::Edge edge = ProfileInfo::getEdge(0, entry);
-    if (!std::binary_search(MST.begin(), MST.end(), edge)) {
-      printEdgeCounter(edge, entry, i);
-      IncrementCounterInBlock(entry, i, Counters); ++NumEdgesInserted;
-      Initializer[i++] = (Zero);
-    } else {
-      Initializer[i++] = (Uncounted);
-    }
-
-    // InsertedBlocks contains all blocks that were inserted for splitting an
-    // edge; these blocks do not have to be instrumented.
-    DenseSet<BasicBlock*> InsertedBlocks;
-    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
-      // Check if block was not inserted and thus does not have to be
-      // instrumented.
-      if (InsertedBlocks.count(BB)) continue;
-
-      // Okay, we have to add a counter of each outgoing edge not in MST. If
-      // the outgoing edge is not critical don't split it, just insert the
-      // counter in the source or destination of the edge. Also, if the block
-      // has no successors, the virtual edge (BB,0) is processed.
-      TerminatorInst *TI = BB->getTerminator();
-      if (TI->getNumSuccessors() == 0) {
-        ProfileInfo::Edge edge = ProfileInfo::getEdge(BB, 0);
-        if (!std::binary_search(MST.begin(), MST.end(), edge)) {
-          printEdgeCounter(edge, BB, i);
-          IncrementCounterInBlock(BB, i, Counters); ++NumEdgesInserted;
-          Initializer[i++] = (Zero);
-        } else {
-          Initializer[i++] = (Uncounted);
-        }
-      }
-      for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
-        BasicBlock *Succ = TI->getSuccessor(s);
-        ProfileInfo::Edge edge = ProfileInfo::getEdge(BB,Succ);
-        if (!std::binary_search(MST.begin(), MST.end(), edge)) {
-
-          // If the edge is critical, split it.
-          bool wasInserted = SplitCriticalEdge(TI, s, this);
-          Succ = TI->getSuccessor(s);
-          if (wasInserted)
-            InsertedBlocks.insert(Succ);
-
-          // Okay, we are guaranteed that the edge is no longer critical. If
-          // we only have a single successor, insert the counter in this block,
-          // otherwise insert it in the successor block.
-          if (TI->getNumSuccessors() == 1) {
-            // Insert counter at the start of the block
-            printEdgeCounter(edge, BB, i);
-            IncrementCounterInBlock(BB, i, Counters); ++NumEdgesInserted;
-          } else {
-            // Insert counter at the start of the block
-            printEdgeCounter(edge, Succ, i);
-            IncrementCounterInBlock(Succ, i, Counters); ++NumEdgesInserted;
-          }
-          Initializer[i++] = (Zero);
-        } else {
-          Initializer[i++] = (Uncounted);
-        }
-      }
-    }
-  }
-
-  // Check if the number of edges counted at first was the number of edges we
-  // considered for instrumentation.
-  assert(i == NumEdges && "the number of edges in counting array is wrong");
-
-  // Assign the now completely defined initialiser to the array.
-  Constant *init = ConstantArray::get(ATy, Initializer);
-  Counters->setInitializer(init);
-
-  // Add the initialization call to main.
-  InsertProfilingInitCall(Main, "llvm_start_opt_edge_profiling", Counters);
-  return true;
-}
-
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/PathProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/PathProfiling.cpp
deleted file mode 100644
index 7de7326..0000000
--- a/contrib/llvm/lib/Transforms/Instrumentation/PathProfiling.cpp
+++ /dev/null
@@ -1,1424 +0,0 @@
-//===- PathProfiling.cpp - Inserts counters for path profiling ------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass instruments functions for Ball-Larus path profiling. Ball-Larus
-// profiling converts the CFG into a DAG by replacing backedges with edges
-// from entry to the start block and from the end block to exit. The paths
-// along the new DAG are enumerated, i.e. each path is given a path number.
-// Edges are instrumented to increment the path number register, such that the
-// path number register will equal the path number of the path taken at the
-// exit.
-//
-// This file defines classes for building a CFG for use with different stages
-// in the Ball-Larus path profiling instrumentation [Ball96]. The
-// requirements are formatting the llvm CFG into the Ball-Larus DAG, path
-// numbering, finding a spanning tree, moving increments from the spanning
-// tree to chords.
-//
-// Terms:
-// DAG - Directed Acyclic Graph.
-// Ball-Larus DAG - A CFG with an entry node, an exit node, and backedges
-//                  removed in the following manner. For every backedge
-//                  v->w, insert edge ENTRY->w and edge v->EXIT.
-// Path Number - The number corresponding to a specific path through a
-//               Ball-Larus DAG.
-// Spanning Tree - A subgraph, S, is a spanning tree if S covers all
-//                 vertices and is a tree.
-// Chord - An edge not in the spanning tree.
-//
-// [Ball96]
-//  T. Ball and J. R. Larus. "Efficient Path Profiling."
-//  International Symposium on Microarchitecture, pages 46-57, 1996.
-//  http://portal.acm.org/citation.cfm?id=243857
-//
-// [Ball94]
-//  Thomas Ball. "Efficiently Counting Program Events with Support for
-//  On-line queries."
-//  ACM Transactions on Programming Languages and Systems, Vol 16, No 5,
-//  September 1994, Pages 1399-1410.
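Stated compactly, the numbering the header above describes works like this: walking nodes in reverse topological order, a node's path count is the sum of its successors' counts, and the increment placed on the i-th outgoing edge is the partial sum over the earlier successors. A self-contained sketch under those assumptions (this is not the in-tree Analysis/PathNumbering code; node 0 is ENTRY and nodes are pre-sorted topologically):

  #include <cstdint>
  #include <vector>

  struct Edge { int from, to; uint64_t inc; };

  // succ[v] lists v's successors; nodes 0..N-1 are in topological order.
  // Fills 'edges' with per-edge increments such that summing them along
  // any ENTRY->EXIT path yields a unique number in [0, numPaths).
  static uint64_t ballLarusNumber(const std::vector<std::vector<int>> &succ,
                                  std::vector<Edge> &edges) {
    std::vector<uint64_t> numPaths(succ.size(), 0);
    for (int v = (int)succ.size() - 1; v >= 0; --v) {
      if (succ[v].empty()) { numPaths[v] = 1; continue; }  // EXIT node
      uint64_t running = 0;
      for (int w : succ[v]) {
        edges.push_back({v, w, running});  // increment for edge v->w
        running += numPaths[w];
      }
      numPaths[v] = running;
    }
    return numPaths[0];  // total number of ENTRY->EXIT paths
  }

The spanning-tree and chord machinery in the classes below then moves these increments off hot edges without changing the path sums, per [Ball94].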
-//===----------------------------------------------------------------------===// -#define DEBUG_TYPE "insert-path-profiling" - -#include "llvm/Transforms/Instrumentation.h" -#include "ProfilingUtils.h" -#include "llvm/Analysis/PathNumbering.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/InstrTypes.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/TypeBuilder.h" -#include "llvm/Pass.h" -#include "llvm/Support/CFG.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Compiler.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include <vector> - -#define HASH_THRESHHOLD 100000 - -using namespace llvm; - -namespace { -class BLInstrumentationNode; -class BLInstrumentationEdge; -class BLInstrumentationDag; - -// --------------------------------------------------------------------------- -// BLInstrumentationNode extends BallLarusNode with member used by the -// instrumentation algortihms. -// --------------------------------------------------------------------------- -class BLInstrumentationNode : public BallLarusNode { -public: - // Creates a new BLInstrumentationNode from a BasicBlock. - BLInstrumentationNode(BasicBlock* BB); - - // Get/sets the Value corresponding to the pathNumber register, - // constant or phinode. Used by the instrumentation code to remember - // path number Values. - Value* getStartingPathNumber(); - void setStartingPathNumber(Value* pathNumber); - - Value* getEndingPathNumber(); - void setEndingPathNumber(Value* pathNumber); - - // Get/set the PHINode Instruction for this node. - PHINode* getPathPHI(); - void setPathPHI(PHINode* pathPHI); - -private: - - Value* _startingPathNumber; // The Value for the current pathNumber. - Value* _endingPathNumber; // The Value for the current pathNumber. - PHINode* _pathPHI; // The PHINode for current pathNumber. -}; - -// -------------------------------------------------------------------------- -// BLInstrumentationEdge extends BallLarusEdge with data about the -// instrumentation that will end up on each edge. -// -------------------------------------------------------------------------- -class BLInstrumentationEdge : public BallLarusEdge { -public: - BLInstrumentationEdge(BLInstrumentationNode* source, - BLInstrumentationNode* target); - - // Sets the target node of this edge. Required to split edges. - void setTarget(BallLarusNode* node); - - // Get/set whether edge is in the spanning tree. - bool isInSpanningTree() const; - void setIsInSpanningTree(bool isInSpanningTree); - - // Get/ set whether this edge will be instrumented with a path number - // initialization. - bool isInitialization() const; - void setIsInitialization(bool isInitialization); - - // Get/set whether this edge will be instrumented with a path counter - // increment. Notice this is incrementing the path counter - // corresponding to the path number register. The path number - // increment is determined by getIncrement(). - bool isCounterIncrement() const; - void setIsCounterIncrement(bool isCounterIncrement); - - // Get/set the path number increment that this edge will be instrumented - // with. This is distinct from the path counter increment and the - // weight. The counter increment counts the number of executions of - // some path, whereas the path number keeps track of which path number - // the program is on. 
- long getIncrement() const; - void setIncrement(long increment); - - // Get/set whether the edge has been instrumented. - bool hasInstrumentation(); - void setHasInstrumentation(bool hasInstrumentation); - - // Returns the successor number of this edge in the source. - unsigned getSuccessorNumber(); - -private: - // The increment that the code will be instrumented with. - long long _increment; - - // Whether this edge is in the spanning tree. - bool _isInSpanningTree; - - // Whether this edge is an initialiation of the path number. - bool _isInitialization; - - // Whether this edge is a path counter increment. - bool _isCounterIncrement; - - // Whether this edge has been instrumented. - bool _hasInstrumentation; -}; - -// --------------------------------------------------------------------------- -// BLInstrumentationDag extends BallLarusDag with algorithms that -// determine where instrumentation should be placed. -// --------------------------------------------------------------------------- -class BLInstrumentationDag : public BallLarusDag { -public: - BLInstrumentationDag(Function &F); - - // Returns the Exit->Root edge. This edge is required for creating - // directed cycles in the algorithm for moving instrumentation off of - // the spanning tree - BallLarusEdge* getExitRootEdge(); - - // Returns an array of phony edges which mark those nodes - // with function calls - BLEdgeVector getCallPhonyEdges(); - - // Gets/sets the path counter array - GlobalVariable* getCounterArray(); - void setCounterArray(GlobalVariable* c); - - // Calculates the increments for the chords, thereby removing - // instrumentation from the spanning tree edges. Implementation is based - // on the algorithm in Figure 4 of [Ball94] - void calculateChordIncrements(); - - // Updates the state when an edge has been split - void splitUpdate(BLInstrumentationEdge* formerEdge, BasicBlock* newBlock); - - // Calculates a spanning tree of the DAG ignoring cycles. Whichever - // edges are in the spanning tree will not be instrumented, but this - // implementation does not try to minimize the instrumentation overhead - // by trying to find hot edges. - void calculateSpanningTree(); - - // Pushes initialization further down in order to group the first - // increment and initialization. - void pushInitialization(); - - // Pushes the path counter increments up in order to group the last path - // number increment. - void pushCounters(); - - // Removes phony edges from the successor list of the source, and the - // predecessor list of the target. - void unlinkPhony(); - - // Generate dot graph for the function - void generateDotGraph(); - -protected: - // BLInstrumentationDag creates BLInstrumentationNode objects in this - // method overriding the creation of BallLarusNode objects. - // - // Allows subclasses to determine which type of Node is created. - // Override this method to produce subclasses of BallLarusNode if - // necessary. - virtual BallLarusNode* createNode(BasicBlock* BB); - - // BLInstrumentationDag create BLInstrumentationEdges. - // - // Allows subclasses to determine which type of Edge is created. - // Override this method to produce subclasses of BallLarusEdge if - // necessary. Parameters source and target will have been created by - // createNode and can be cast to the subclass of BallLarusNode* - // returned by createNode. - virtual BallLarusEdge* createEdge( - BallLarusNode* source, BallLarusNode* target, unsigned edgeNumber); - -private: - BLEdgeVector _treeEdges; // All edges in the spanning tree. 
-  BLEdgeVector _chordEdges; // All edges not in the spanning tree.
-  GlobalVariable* _counterArray; // Array to store path counters
-
-  // Removes the edge from the appropriate predecessor and successor lists.
-  void unlinkEdge(BallLarusEdge* edge);
-
-  // Makes an edge part of the spanning tree.
-  void makeEdgeSpanning(BLInstrumentationEdge* edge);
-
-  // Pushes initialization and calls itself recursively.
-  void pushInitializationFromEdge(BLInstrumentationEdge* edge);
-
-  // Pushes path counter increments up recursively.
-  void pushCountersFromEdge(BLInstrumentationEdge* edge);
-
-  // Depth first algorithm for determining the chord increments.
-  void calculateChordIncrementsDfs(
-    long weight, BallLarusNode* v, BallLarusEdge* e);
-
-  // Determines the relative direction of two edges.
-  int calculateChordIncrementsDir(BallLarusEdge* e, BallLarusEdge* f);
-};
-
-// ---------------------------------------------------------------------------
-// PathProfiler is a module pass which instruments path profiling instructions
-// ---------------------------------------------------------------------------
-class PathProfiler : public ModulePass {
-private:
-  // Current context for multi threading support.
-  LLVMContext* Context;
-
-  // Which function are we currently instrumenting
-  unsigned currentFunctionNumber;
-
-  // The function prototype in the profiling runtime for incrementing a
-  // single path counter in a hash table.
-  Constant* llvmIncrementHashFunction;
-  Constant* llvmDecrementHashFunction;
-
-  // Instruments each function with path profiling. 'main' is instrumented
-  // with code to save the profile to disk.
-  bool runOnModule(Module &M);
-
-  // Analyzes the function for Ball-Larus path profiling, and inserts code.
-  void runOnFunction(std::vector<Constant*> &ftInit, Function &F, Module &M);
-
-  // Creates an increment constant representing incr.
-  ConstantInt* createIncrementConstant(long incr, int bitsize);
-
-  // Creates an increment constant representing the value in
-  // edge->getIncrement().
-  ConstantInt* createIncrementConstant(BLInstrumentationEdge* edge);
-
-  // Finds the insertion point after pathNumber in block. PathNumber may
-  // be NULL.
-  BasicBlock::iterator getInsertionPoint(
-    BasicBlock* block, Value* pathNumber);
-
-  // Inserts source's pathNumber Value* into target. Target may or may not
-  // have multiple predecessors, and may or may not have its phiNode
-  // initialized.
-  void pushValueIntoNode(
-    BLInstrumentationNode* source, BLInstrumentationNode* target);
-
-  // Inserts source's pathNumber Value* into the appropriate slot of
-  // target's phiNode.
-  void pushValueIntoPHI(
-    BLInstrumentationNode* target, BLInstrumentationNode* source);
-
-  // The Value* in node, oldVal, is updated with a Value* corresponding to
-  // oldVal + addition.
-  void insertNumberIncrement(BLInstrumentationNode* node, Value* addition,
-                             bool atBeginning);
-
-  // Creates a counter increment in the given node. The Value* in node is
-  // taken as the index into a hash table.
-  void insertCounterIncrement(
-    Value* incValue,
-    BasicBlock::iterator insertPoint,
-    BLInstrumentationDag* dag,
-    bool increment = true);
-
-  // A PHINode is created in the node, and its values initialized to -1U.
-  void preparePHI(BLInstrumentationNode* node);
-
-  // Inserts instrumentation for the given edge
-  //
-  // Pre: The edge's source node has pathNumber set if edge is non zero
-  //      path number increment.
- // - // Post: Edge's target node has a pathNumber set to the path number Value - // corresponding to the value of the path register after edge's - // execution. - void insertInstrumentationStartingAt( - BLInstrumentationEdge* edge, - BLInstrumentationDag* dag); - - // If this edge is a critical edge, then inserts a node at this edge. - // This edge becomes the first edge, and a new BallLarusEdge is created. - bool splitCritical(BLInstrumentationEdge* edge, BLInstrumentationDag* dag); - - // Inserts instrumentation according to the marked edges in dag. Phony - // edges must be unlinked from the DAG, but accessible from the - // backedges. Dag must have initializations, path number increments, and - // counter increments present. - // - // Counter storage is created here. - void insertInstrumentation( BLInstrumentationDag& dag, Module &M); - -public: - static char ID; // Pass identification, replacement for typeid - PathProfiler() : ModulePass(ID) { - initializePathProfilerPass(*PassRegistry::getPassRegistry()); - } - - virtual const char *getPassName() const { - return "Path Profiler"; - } -}; -} // end anonymous namespace - -// Should we print the dot-graphs -static cl::opt<bool> DotPathDag("path-profile-pathdag", cl::Hidden, - cl::desc("Output the path profiling DAG for each function.")); - -// Register the path profiler as a pass -char PathProfiler::ID = 0; -INITIALIZE_PASS(PathProfiler, "insert-path-profiling", - "Insert instrumentation for Ball-Larus path profiling", - false, false) - -ModulePass *llvm::createPathProfilerPass() { return new PathProfiler(); } - -namespace llvm { - class PathProfilingFunctionTable {}; - - // Type for global array storing references to hashes or arrays - template<bool xcompile> class TypeBuilder<PathProfilingFunctionTable, - xcompile> { - public: - static StructType *get(LLVMContext& C) { - return( StructType::get( - TypeBuilder<types::i<32>, xcompile>::get(C), // type - TypeBuilder<types::i<32>, xcompile>::get(C), // array size - TypeBuilder<types::i<8>*, xcompile>::get(C), // array/hash ptr - NULL)); - } - }; - - typedef TypeBuilder<PathProfilingFunctionTable, true> - ftEntryTypeBuilder; - - // BallLarusEdge << operator overloading - raw_ostream& operator<<(raw_ostream& os, - const BLInstrumentationEdge& edge) - LLVM_ATTRIBUTE_USED; - raw_ostream& operator<<(raw_ostream& os, - const BLInstrumentationEdge& edge) { - os << "[" << edge.getSource()->getName() << " -> " - << edge.getTarget()->getName() << "] init: " - << (edge.isInitialization() ? "yes" : "no") - << " incr:" << edge.getIncrement() << " cinc: " - << (edge.isCounterIncrement() ? "yes" : "no"); - return(os); - } -} - -// Creates a new BLInstrumentationNode from a BasicBlock. -BLInstrumentationNode::BLInstrumentationNode(BasicBlock* BB) : - BallLarusNode(BB), - _startingPathNumber(NULL), _endingPathNumber(NULL), _pathPHI(NULL) {} - -// Constructor for BLInstrumentationEdge. -BLInstrumentationEdge::BLInstrumentationEdge(BLInstrumentationNode* source, - BLInstrumentationNode* target) - : BallLarusEdge(source, target, 0), - _increment(0), _isInSpanningTree(false), _isInitialization(false), - _isCounterIncrement(false), _hasInstrumentation(false) {} - -// Sets the target node of this edge. Required to split edges. -void BLInstrumentationEdge::setTarget(BallLarusNode* node) { - _target = node; -} - -// Returns whether this edge is in the spanning tree. -bool BLInstrumentationEdge::isInSpanningTree() const { - return(_isInSpanningTree); -} - -// Sets whether this edge is in the spanning tree. 
-void BLInstrumentationEdge::setIsInSpanningTree(bool isInSpanningTree) { - _isInSpanningTree = isInSpanningTree; -} - -// Returns whether this edge will be instrumented with a path number -// initialization. -bool BLInstrumentationEdge::isInitialization() const { - return(_isInitialization); -} - -// Sets whether this edge will be instrumented with a path number -// initialization. -void BLInstrumentationEdge::setIsInitialization(bool isInitialization) { - _isInitialization = isInitialization; -} - -// Returns whether this edge will be instrumented with a path counter -// increment. Notice this is incrementing the path counter -// corresponding to the path number register. The path number -// increment is determined by getIncrement(). -bool BLInstrumentationEdge::isCounterIncrement() const { - return(_isCounterIncrement); -} - -// Sets whether this edge will be instrumented with a path counter -// increment. -void BLInstrumentationEdge::setIsCounterIncrement(bool isCounterIncrement) { - _isCounterIncrement = isCounterIncrement; -} - -// Gets the path number increment that this edge will be instrumented -// with. This is distinct from the path counter increment and the -// weight. The counter increment is counts the number of executions of -// some path, whereas the path number keeps track of which path number -// the program is on. -long BLInstrumentationEdge::getIncrement() const { - return(_increment); -} - -// Set whether this edge will be instrumented with a path number -// increment. -void BLInstrumentationEdge::setIncrement(long increment) { - _increment = increment; -} - -// True iff the edge has already been instrumented. -bool BLInstrumentationEdge::hasInstrumentation() { - return(_hasInstrumentation); -} - -// Set whether this edge has been instrumented. -void BLInstrumentationEdge::setHasInstrumentation(bool hasInstrumentation) { - _hasInstrumentation = hasInstrumentation; -} - -// Returns the successor number of this edge in the source. -unsigned BLInstrumentationEdge::getSuccessorNumber() { - BallLarusNode* sourceNode = getSource(); - BallLarusNode* targetNode = getTarget(); - BasicBlock* source = sourceNode->getBlock(); - BasicBlock* target = targetNode->getBlock(); - - if(source == NULL || target == NULL) - return(0); - - TerminatorInst* terminator = source->getTerminator(); - - unsigned i; - for(i=0; i < terminator->getNumSuccessors(); i++) { - if(terminator->getSuccessor(i) == target) - break; - } - - return(i); -} - -// BLInstrumentationDag constructor initializes a DAG for the given Function. -BLInstrumentationDag::BLInstrumentationDag(Function &F) : BallLarusDag(F), - _counterArray(0) { -} - -// Returns the Exit->Root edge. 
This edge is required for creating -// directed cycles in the algorithm for moving instrumentation off of -// the spanning tree -BallLarusEdge* BLInstrumentationDag::getExitRootEdge() { - BLEdgeIterator erEdge = getExit()->succBegin(); - return(*erEdge); -} - -BLEdgeVector BLInstrumentationDag::getCallPhonyEdges () { - BLEdgeVector callEdges; - - for( BLEdgeIterator edge = _edges.begin(), end = _edges.end(); - edge != end; edge++ ) { - if( (*edge)->getType() == BallLarusEdge::CALLEDGE_PHONY ) - callEdges.push_back(*edge); - } - - return callEdges; -} - -// Gets the path counter array -GlobalVariable* BLInstrumentationDag::getCounterArray() { - return _counterArray; -} - -void BLInstrumentationDag::setCounterArray(GlobalVariable* c) { - _counterArray = c; -} - -// Calculates the increment for the chords, thereby removing -// instrumentation from the spanning tree edges. Implementation is based on -// the algorithm in Figure 4 of [Ball94] -void BLInstrumentationDag::calculateChordIncrements() { - calculateChordIncrementsDfs(0, getRoot(), NULL); - - BLInstrumentationEdge* chord; - for(BLEdgeIterator chordEdge = _chordEdges.begin(), - end = _chordEdges.end(); chordEdge != end; chordEdge++) { - chord = (BLInstrumentationEdge*) *chordEdge; - chord->setIncrement(chord->getIncrement() + chord->getWeight()); - } -} - -// Updates the state when an edge has been split -void BLInstrumentationDag::splitUpdate(BLInstrumentationEdge* formerEdge, - BasicBlock* newBlock) { - BallLarusNode* oldTarget = formerEdge->getTarget(); - BallLarusNode* newNode = addNode(newBlock); - formerEdge->setTarget(newNode); - newNode->addPredEdge(formerEdge); - - DEBUG(dbgs() << " Edge split: " << *formerEdge << "\n"); - - oldTarget->removePredEdge(formerEdge); - BallLarusEdge* newEdge = addEdge(newNode, oldTarget,0); - - if( formerEdge->getType() == BallLarusEdge::BACKEDGE || - formerEdge->getType() == BallLarusEdge::SPLITEDGE) { - newEdge->setType(formerEdge->getType()); - newEdge->setPhonyRoot(formerEdge->getPhonyRoot()); - newEdge->setPhonyExit(formerEdge->getPhonyExit()); - formerEdge->setType(BallLarusEdge::NORMAL); - formerEdge->setPhonyRoot(NULL); - formerEdge->setPhonyExit(NULL); - } -} - -// Calculates a spanning tree of the DAG ignoring cycles. Whichever -// edges are in the spanning tree will not be instrumented, but this -// implementation does not try to minimize the instrumentation overhead -// by trying to find hot edges. -void BLInstrumentationDag::calculateSpanningTree() { - std::stack<BallLarusNode*> dfsStack; - - for(BLNodeIterator nodeIt = _nodes.begin(), end = _nodes.end(); - nodeIt != end; nodeIt++) { - (*nodeIt)->setColor(BallLarusNode::WHITE); - } - - dfsStack.push(getRoot()); - while(dfsStack.size() > 0) { - BallLarusNode* node = dfsStack.top(); - dfsStack.pop(); - - if(node->getColor() == BallLarusNode::WHITE) - continue; - - BallLarusNode* nextNode; - bool forward = true; - BLEdgeIterator succEnd = node->succEnd(); - - node->setColor(BallLarusNode::WHITE); - // first iterate over successors then predecessors - for(BLEdgeIterator edge = node->succBegin(), predEnd = node->predEnd(); - edge != predEnd; edge++) { - if(edge == succEnd) { - edge = node->predBegin(); - forward = false; - } - - // Ignore split edges - if ((*edge)->getType() == BallLarusEdge::SPLITEDGE) - continue; - - nextNode = forward? 
(*edge)->getTarget(): (*edge)->getSource(); - if(nextNode->getColor() != BallLarusNode::WHITE) { - nextNode->setColor(BallLarusNode::WHITE); - makeEdgeSpanning((BLInstrumentationEdge*)(*edge)); - } - } - } - - for(BLEdgeIterator edge = _edges.begin(), end = _edges.end(); - edge != end; edge++) { - BLInstrumentationEdge* instEdge = (BLInstrumentationEdge*) (*edge); - // safe since createEdge is overriden - if(!instEdge->isInSpanningTree() && (*edge)->getType() - != BallLarusEdge::SPLITEDGE) - _chordEdges.push_back(instEdge); - } -} - -// Pushes initialization further down in order to group the first -// increment and initialization. -void BLInstrumentationDag::pushInitialization() { - BLInstrumentationEdge* exitRootEdge = - (BLInstrumentationEdge*) getExitRootEdge(); - exitRootEdge->setIsInitialization(true); - pushInitializationFromEdge(exitRootEdge); -} - -// Pushes the path counter increments up in order to group the last path -// number increment. -void BLInstrumentationDag::pushCounters() { - BLInstrumentationEdge* exitRootEdge = - (BLInstrumentationEdge*) getExitRootEdge(); - exitRootEdge->setIsCounterIncrement(true); - pushCountersFromEdge(exitRootEdge); -} - -// Removes phony edges from the successor list of the source, and the -// predecessor list of the target. -void BLInstrumentationDag::unlinkPhony() { - BallLarusEdge* edge; - - for(BLEdgeIterator next = _edges.begin(), - end = _edges.end(); next != end; next++) { - edge = (*next); - - if( edge->getType() == BallLarusEdge::BACKEDGE_PHONY || - edge->getType() == BallLarusEdge::SPLITEDGE_PHONY || - edge->getType() == BallLarusEdge::CALLEDGE_PHONY ) { - unlinkEdge(edge); - } - } -} - -// Generate a .dot graph to represent the DAG and pathNumbers -void BLInstrumentationDag::generateDotGraph() { - std::string errorInfo; - std::string functionName = getFunction().getName().str(); - std::string filename = "pathdag." + functionName + ".dot"; - - DEBUG (dbgs() << "Writing '" << filename << "'...\n"); - raw_fd_ostream dotFile(filename.c_str(), errorInfo); - - if (!errorInfo.empty()) { - errs() << "Error opening '" << filename.c_str() <<"' for writing!"; - errs() << "\n"; - return; - } - - dotFile << "digraph " << functionName << " {\n"; - - for( BLEdgeIterator edge = _edges.begin(), end = _edges.end(); - edge != end; edge++) { - std::string sourceName = (*edge)->getSource()->getName(); - std::string targetName = (*edge)->getTarget()->getName(); - - dotFile << "\t\"" << sourceName.c_str() << "\" -> \"" - << targetName.c_str() << "\" "; - - long inc = ((BLInstrumentationEdge*)(*edge))->getIncrement(); - - switch( (*edge)->getType() ) { - case BallLarusEdge::NORMAL: - dotFile << "[label=" << inc << "] [color=black];\n"; - break; - - case BallLarusEdge::BACKEDGE: - dotFile << "[color=cyan];\n"; - break; - - case BallLarusEdge::BACKEDGE_PHONY: - dotFile << "[label=" << inc - << "] [color=blue];\n"; - break; - - case BallLarusEdge::SPLITEDGE: - dotFile << "[color=violet];\n"; - break; - - case BallLarusEdge::SPLITEDGE_PHONY: - dotFile << "[label=" << inc << "] [color=red];\n"; - break; - - case BallLarusEdge::CALLEDGE_PHONY: - dotFile << "[label=" << inc << "] [color=green];\n"; - break; - } - } - - dotFile << "}\n"; -} - -// Allows subclasses to determine which type of Node is created. -// Override this method to produce subclasses of BallLarusNode if -// necessary. The destructor of BallLarusDag will call free on each pointer -// created. 
-BallLarusNode* BLInstrumentationDag::createNode(BasicBlock* BB) { - return( new BLInstrumentationNode(BB) ); -} - -// Allows subclasses to determine which type of Edge is created. -// Override this method to produce subclasses of BallLarusEdge if -// necessary. The destructor of BallLarusDag will call free on each pointer -// created. -BallLarusEdge* BLInstrumentationDag::createEdge(BallLarusNode* source, - BallLarusNode* target, unsigned edgeNumber) { - // One can cast from BallLarusNode to BLInstrumentationNode since createNode - // is overriden to produce BLInstrumentationNode. - return( new BLInstrumentationEdge((BLInstrumentationNode*)source, - (BLInstrumentationNode*)target) ); -} - -// Sets the Value corresponding to the pathNumber register, constant, -// or phinode. Used by the instrumentation code to remember path -// number Values. -Value* BLInstrumentationNode::getStartingPathNumber(){ - return(_startingPathNumber); -} - -// Sets the Value of the pathNumber. Used by the instrumentation code. -void BLInstrumentationNode::setStartingPathNumber(Value* pathNumber) { - DEBUG(dbgs() << " SPN-" << getName() << " <-- " << (pathNumber ? - pathNumber->getName() : - "unused") << "\n"); - _startingPathNumber = pathNumber; -} - -Value* BLInstrumentationNode::getEndingPathNumber(){ - return(_endingPathNumber); -} - -void BLInstrumentationNode::setEndingPathNumber(Value* pathNumber) { - DEBUG(dbgs() << " EPN-" << getName() << " <-- " - << (pathNumber ? pathNumber->getName() : "unused") << "\n"); - _endingPathNumber = pathNumber; -} - -// Get the PHINode Instruction for this node. Used by instrumentation -// code. -PHINode* BLInstrumentationNode::getPathPHI() { - return(_pathPHI); -} - -// Set the PHINode Instruction for this node. Used by instrumentation -// code. -void BLInstrumentationNode::setPathPHI(PHINode* pathPHI) { - _pathPHI = pathPHI; -} - -// Removes the edge from the appropriate predecessor and successor -// lists. -void BLInstrumentationDag::unlinkEdge(BallLarusEdge* edge) { - if(edge == getExitRootEdge()) - DEBUG(dbgs() << " Removing exit->root edge\n"); - - edge->getSource()->removeSuccEdge(edge); - edge->getTarget()->removePredEdge(edge); -} - -// Makes an edge part of the spanning tree. -void BLInstrumentationDag::makeEdgeSpanning(BLInstrumentationEdge* edge) { - edge->setIsInSpanningTree(true); - _treeEdges.push_back(edge); -} - -// Pushes initialization and calls itself recursively. -void BLInstrumentationDag::pushInitializationFromEdge( - BLInstrumentationEdge* edge) { - BallLarusNode* target; - - target = edge->getTarget(); - if( target->getNumberPredEdges() > 1 || target == getExit() ) { - return; - } else { - for(BLEdgeIterator next = target->succBegin(), - end = target->succEnd(); next != end; next++) { - BLInstrumentationEdge* intoEdge = (BLInstrumentationEdge*) *next; - - // Skip split edges - if (intoEdge->getType() == BallLarusEdge::SPLITEDGE) - continue; - - intoEdge->setIncrement(intoEdge->getIncrement() + - edge->getIncrement()); - intoEdge->setIsInitialization(true); - pushInitializationFromEdge(intoEdge); - } - - edge->setIncrement(0); - edge->setIsInitialization(false); - } -} - -// Pushes path counter increments up recursively. 
-void BLInstrumentationDag::pushCountersFromEdge(BLInstrumentationEdge* edge) { - BallLarusNode* source; - - source = edge->getSource(); - if(source->getNumberSuccEdges() > 1 || source == getRoot() - || edge->isInitialization()) { - return; - } else { - for(BLEdgeIterator previous = source->predBegin(), - end = source->predEnd(); previous != end; previous++) { - BLInstrumentationEdge* fromEdge = (BLInstrumentationEdge*) *previous; - - // Skip split edges - if (fromEdge->getType() == BallLarusEdge::SPLITEDGE) - continue; - - fromEdge->setIncrement(fromEdge->getIncrement() + - edge->getIncrement()); - fromEdge->setIsCounterIncrement(true); - pushCountersFromEdge(fromEdge); - } - - edge->setIncrement(0); - edge->setIsCounterIncrement(false); - } -} - -// Depth first algorithm for determining the chord increments. -void BLInstrumentationDag::calculateChordIncrementsDfs(long weight, - BallLarusNode* v, BallLarusEdge* e) { - BLInstrumentationEdge* f; - - for(BLEdgeIterator treeEdge = _treeEdges.begin(), - end = _treeEdges.end(); treeEdge != end; treeEdge++) { - f = (BLInstrumentationEdge*) *treeEdge; - if(e != f && v == f->getTarget()) { - calculateChordIncrementsDfs( - calculateChordIncrementsDir(e,f)*(weight) + - f->getWeight(), f->getSource(), f); - } - if(e != f && v == f->getSource()) { - calculateChordIncrementsDfs( - calculateChordIncrementsDir(e,f)*(weight) + - f->getWeight(), f->getTarget(), f); - } - } - - for(BLEdgeIterator chordEdge = _chordEdges.begin(), - end = _chordEdges.end(); chordEdge != end; chordEdge++) { - f = (BLInstrumentationEdge*) *chordEdge; - if(v == f->getSource() || v == f->getTarget()) { - f->setIncrement(f->getIncrement() + - calculateChordIncrementsDir(e,f)*weight); - } - } -} - -// Determines the relative direction of two edges. -int BLInstrumentationDag::calculateChordIncrementsDir(BallLarusEdge* e, - BallLarusEdge* f) { - if( e == NULL) - return(1); - else if(e->getSource() == f->getTarget() - || e->getTarget() == f->getSource()) - return(1); - - return(-1); -} - -// Creates an increment constant representing incr. -ConstantInt* PathProfiler::createIncrementConstant(long incr, - int bitsize) { - return(ConstantInt::get(IntegerType::get(*Context, 32), incr)); -} - -// Creates an increment constant representing the value in -// edge->getIncrement(). -ConstantInt* PathProfiler::createIncrementConstant( - BLInstrumentationEdge* edge) { - return(createIncrementConstant(edge->getIncrement(), 32)); -} - -// Finds the insertion point after pathNumber in block. PathNumber may -// be NULL. -BasicBlock::iterator PathProfiler::getInsertionPoint(BasicBlock* block, Value* - pathNumber) { - if(pathNumber == NULL || isa<ConstantInt>(pathNumber) - || (((Instruction*)(pathNumber))->getParent()) != block) { - return(block->getFirstInsertionPt()); - } else { - Instruction* pathNumberInst = (Instruction*) (pathNumber); - BasicBlock::iterator insertPoint; - BasicBlock::iterator end = block->end(); - - for(insertPoint = block->begin(); - insertPoint != end; insertPoint++) { - Instruction* insertInst = &(*insertPoint); - - if(insertInst == pathNumberInst) - return(++insertPoint); - } - - return(insertPoint); - } -} - -// A PHINode is created in the node, and its values initialized to -1U. 
-void PathProfiler::preparePHI(BLInstrumentationNode* node) {
-  BasicBlock* block = node->getBlock();
-  BasicBlock::iterator insertPoint = block->getFirstInsertionPt();
-  pred_iterator PB = pred_begin(node->getBlock()),
-          PE = pred_end(node->getBlock());
-  PHINode* phi = PHINode::Create(Type::getInt32Ty(*Context),
-                                 std::distance(PB, PE), "pathNumber",
-                                 insertPoint );
-  node->setPathPHI(phi);
-  node->setStartingPathNumber(phi);
-  node->setEndingPathNumber(phi);
-
-  for(pred_iterator predIt = PB; predIt != PE; predIt++) {
-    BasicBlock* pred = (*predIt);
-
-    if(pred != NULL)
-      phi->addIncoming(createIncrementConstant((long)-1, 32), pred);
-  }
-}
-
-// Inserts source's pathNumber Value* into target. Target may or may not
-// have multiple predecessors, and may or may not have its phiNode
-// initialized.
-void PathProfiler::pushValueIntoNode(BLInstrumentationNode* source,
-                                     BLInstrumentationNode* target) {
-  if(target->getBlock() == NULL)
-    return;
-
-
-  if(target->getNumberPredEdges() <= 1) {
-    assert(target->getStartingPathNumber() == NULL &&
-           "Target already has path number");
-    target->setStartingPathNumber(source->getEndingPathNumber());
-    target->setEndingPathNumber(source->getEndingPathNumber());
-    DEBUG(dbgs() << "  Passing path number"
-          << (source->getEndingPathNumber() ? "" : " (null)")
-          << " value through.\n");
-  } else {
-    if(target->getPathPHI() == NULL) {
-      DEBUG(dbgs() << "  Initializing PHI node for block '"
-            << target->getName() << "'\n");
-      preparePHI(target);
-    }
-    pushValueIntoPHI(target, source);
-    DEBUG(dbgs() << "  Passing number value into PHI for block '"
-          << target->getName() << "'\n");
-  }
-}
-
-// Inserts source's pathNumber Value* into the appropriate slot of
-// target's phiNode.
-void PathProfiler::pushValueIntoPHI(BLInstrumentationNode* target,
-                                    BLInstrumentationNode* source) {
-  PHINode* phi = target->getPathPHI();
-  assert(phi != NULL && "  Tried to push value into node with PHI, but node"
-         " actually had no PHI.");
-  phi->removeIncomingValue(source->getBlock(), false);
-  phi->addIncoming(source->getEndingPathNumber(), source->getBlock());
-}
-
-// The Value* in node, oldVal, is updated with a Value* corresponding to
-// oldVal + addition.
-void PathProfiler::insertNumberIncrement(BLInstrumentationNode* node,
-                                         Value* addition, bool atBeginning) {
-  BasicBlock* block = node->getBlock();
-  assert(node->getStartingPathNumber() != NULL);
-  assert(node->getEndingPathNumber() != NULL);
-
-  BasicBlock::iterator insertPoint;
-
-  if( atBeginning )
-    insertPoint = block->getFirstInsertionPt();
-  else
-    insertPoint = block->getTerminator();
-
-  DEBUG(errs() << "  Creating addition instruction.\n");
-  Value* newpn = BinaryOperator::Create(Instruction::Add,
-                                        node->getStartingPathNumber(),
-                                        addition, "pathNumber", insertPoint);
-
-  node->setEndingPathNumber(newpn);
-
-  if( atBeginning )
-    node->setStartingPathNumber(newpn);
-}
-
-// Creates a counter increment in the given node. The Value* in node is
-// taken as the index into an array or hash table. The hash table access
-// is a call to the runtime.
-void PathProfiler::insertCounterIncrement(Value* incValue, - BasicBlock::iterator insertPoint, - BLInstrumentationDag* dag, - bool increment) { - // Counter increment for array - if( dag->getNumberOfPaths() <= HASH_THRESHHOLD ) { - // Get pointer to the array location - std::vector<Value*> gepIndices(2); - gepIndices[0] = Constant::getNullValue(Type::getInt32Ty(*Context)); - gepIndices[1] = incValue; - - GetElementPtrInst* pcPointer = - GetElementPtrInst::Create(dag->getCounterArray(), gepIndices, - "counterInc", insertPoint); - - // Load from the array - call it oldPC - LoadInst* oldPc = new LoadInst(pcPointer, "oldPC", insertPoint); - - // Test to see whether adding 1 will overflow the counter - ICmpInst* isMax = new ICmpInst(insertPoint, CmpInst::ICMP_ULT, oldPc, - createIncrementConstant(0xffffffff, 32), - "isMax"); - - // Select increment for the path counter based on overflow - SelectInst* inc = - SelectInst::Create( isMax, createIncrementConstant(increment?1:-1,32), - createIncrementConstant(0,32), - "pathInc", insertPoint); - - // newPc = oldPc + inc - BinaryOperator* newPc = BinaryOperator::Create(Instruction::Add, - oldPc, inc, "newPC", - insertPoint); - - // Store back in to the array - new StoreInst(newPc, pcPointer, insertPoint); - } else { // Counter increment for hash - std::vector<Value*> args(2); - args[0] = ConstantInt::get(Type::getInt32Ty(*Context), - currentFunctionNumber); - args[1] = incValue; - - CallInst::Create( - increment ? llvmIncrementHashFunction : llvmDecrementHashFunction, - args, "", insertPoint); - } -} - -// Inserts instrumentation for the given edge -// -// Pre: The edge's source node has pathNumber set if edge is non zero -// path number increment. -// -// Post: Edge's target node has a pathNumber set to the path number Value -// corresponding to the value of the path register after edge's -// execution. -// -// FIXME: This should be reworked so it's not recursive. -void PathProfiler::insertInstrumentationStartingAt(BLInstrumentationEdge* edge, - BLInstrumentationDag* dag) { - // Mark the edge as instrumented - edge->setHasInstrumentation(true); - DEBUG(dbgs() << "\nInstrumenting edge: " << (*edge) << "\n"); - - // create a new node for this edge's instrumentation - splitCritical(edge, dag); - - BLInstrumentationNode* sourceNode = (BLInstrumentationNode*)edge->getSource(); - BLInstrumentationNode* targetNode = (BLInstrumentationNode*)edge->getTarget(); - BLInstrumentationNode* instrumentNode; - BLInstrumentationNode* nextSourceNode; - - bool atBeginning = false; - - // Source node has only 1 successor so any information can be simply - // inserted in to it without splitting - if( sourceNode->getBlock() && sourceNode->getNumberSuccEdges() <= 1) { - DEBUG(dbgs() << " Potential instructions to be placed in: " - << sourceNode->getName() << " (at end)\n"); - instrumentNode = sourceNode; - nextSourceNode = targetNode; // ... since we never made any new nodes - } - - // The target node only has one predecessor, so we can safely insert edge - // instrumentation into it. If there was splitting, it must have been - // successful. - else if( targetNode->getNumberPredEdges() == 1 ) { - DEBUG(dbgs() << " Potential instructions to be placed in: " - << targetNode->getName() << " (at beginning)\n"); - pushValueIntoNode(sourceNode, targetNode); - instrumentNode = targetNode; - nextSourceNode = NULL; // ... otherwise we'll just keep splitting - atBeginning = true; - } - - // Somehow, splitting must have failed. 
- else { - errs() << "Instrumenting could not split a critical edge.\n"; - DEBUG(dbgs() << " Couldn't split edge " << (*edge) << ".\n"); - return; - } - - // Insert instrumentation if this is a back or split edge - if( edge->getType() == BallLarusEdge::BACKEDGE || - edge->getType() == BallLarusEdge::SPLITEDGE ) { - BLInstrumentationEdge* top = - (BLInstrumentationEdge*) edge->getPhonyRoot(); - BLInstrumentationEdge* bottom = - (BLInstrumentationEdge*) edge->getPhonyExit(); - - assert( top->isInitialization() && " Top phony edge did not" - " contain a path number initialization."); - assert( bottom->isCounterIncrement() && " Bottom phony edge" - " did not contain a path counter increment."); - - // split edge has yet to be initialized - if( !instrumentNode->getEndingPathNumber() ) { - instrumentNode->setStartingPathNumber(createIncrementConstant(0,32)); - instrumentNode->setEndingPathNumber(createIncrementConstant(0,32)); - } - - BasicBlock::iterator insertPoint = atBeginning ? - instrumentNode->getBlock()->getFirstInsertionPt() : - instrumentNode->getBlock()->getTerminator(); - - // add information from the bottom edge, if it exists - if( bottom->getIncrement() ) { - Value* newpn = - BinaryOperator::Create(Instruction::Add, - instrumentNode->getStartingPathNumber(), - createIncrementConstant(bottom), - "pathNumber", insertPoint); - instrumentNode->setEndingPathNumber(newpn); - } - - insertCounterIncrement(instrumentNode->getEndingPathNumber(), - insertPoint, dag); - - if( atBeginning ) - instrumentNode->setStartingPathNumber(createIncrementConstant(top)); - - instrumentNode->setEndingPathNumber(createIncrementConstant(top)); - - // Check for path counter increments - if( top->isCounterIncrement() ) { - insertCounterIncrement(instrumentNode->getEndingPathNumber(), - instrumentNode->getBlock()->getTerminator(),dag); - instrumentNode->setEndingPathNumber(0); - } - } - - // Insert instrumentation if this is a normal edge - else { - BasicBlock::iterator insertPoint = atBeginning ? - instrumentNode->getBlock()->getFirstInsertionPt() : - instrumentNode->getBlock()->getTerminator(); - - if( edge->isInitialization() ) { // initialize path number - instrumentNode->setEndingPathNumber(createIncrementConstant(edge)); - } else if( edge->getIncrement() ) {// increment path number - Value* newpn = - BinaryOperator::Create(Instruction::Add, - instrumentNode->getStartingPathNumber(), - createIncrementConstant(edge), - "pathNumber", insertPoint); - instrumentNode->setEndingPathNumber(newpn); - - if( atBeginning ) - instrumentNode->setStartingPathNumber(newpn); - } - - // Check for path counter increments - if( edge->isCounterIncrement() ) { - insertCounterIncrement(instrumentNode->getEndingPathNumber(), - insertPoint, dag); - instrumentNode->setEndingPathNumber(0); - } - } - - // Push it along - if (nextSourceNode && instrumentNode->getEndingPathNumber()) - pushValueIntoNode(instrumentNode, nextSourceNode); - - // Add all the successors - for( BLEdgeIterator next = targetNode->succBegin(), - end = targetNode->succEnd(); next != end; next++ ) { - // So long as it is un-instrumented, add it to the list - if( !((BLInstrumentationEdge*)(*next))->hasInstrumentation() ) - insertInstrumentationStartingAt((BLInstrumentationEdge*)*next,dag); - else - DEBUG(dbgs() << " Edge " << *(BLInstrumentationEdge*)(*next) - << " already instrumented.\n"); - } -} - -// Inserts instrumentation according to the marked edges in dag. Phony edges -// must be unlinked from the DAG, but accessible from the backedges. 
Dag -// must have initializations, path number increments, and counter increments -// present. -// -// Counter storage is created here. -void PathProfiler::insertInstrumentation( - BLInstrumentationDag& dag, Module &M) { - - BLInstrumentationEdge* exitRootEdge = - (BLInstrumentationEdge*) dag.getExitRootEdge(); - insertInstrumentationStartingAt(exitRootEdge, &dag); - - // Iterate through each call edge and apply the appropriate hash increment - // and decrement functions - BLEdgeVector callEdges = dag.getCallPhonyEdges(); - for( BLEdgeIterator edge = callEdges.begin(), - end = callEdges.end(); edge != end; edge++ ) { - BLInstrumentationNode* node = - (BLInstrumentationNode*)(*edge)->getSource(); - BasicBlock::iterator insertPoint = node->getBlock()->getFirstInsertionPt(); - - // Find the first function call - while( ((Instruction&)(*insertPoint)).getOpcode() != Instruction::Call ) - insertPoint++; - - DEBUG(dbgs() << "\nInstrumenting method call block '" - << node->getBlock()->getName() << "'\n"); - DEBUG(dbgs() << " Path number initialized: " - << ((node->getStartingPathNumber()) ? "yes" : "no") << "\n"); - - Value* newpn; - if( node->getStartingPathNumber() ) { - long inc = ((BLInstrumentationEdge*)(*edge))->getIncrement(); - if ( inc ) - newpn = BinaryOperator::Create(Instruction::Add, - node->getStartingPathNumber(), - createIncrementConstant(inc,32), - "pathNumber", insertPoint); - else - newpn = node->getStartingPathNumber(); - } else { - newpn = (Value*)createIncrementConstant( - ((BLInstrumentationEdge*)(*edge))->getIncrement(), 32); - } - - insertCounterIncrement(newpn, insertPoint, &dag); - insertCounterIncrement(newpn, node->getBlock()->getTerminator(), - &dag, false); - } -} - -// Entry point of the module -void PathProfiler::runOnFunction(std::vector<Constant*> &ftInit, - Function &F, Module &M) { - // Build DAG from CFG - BLInstrumentationDag dag = BLInstrumentationDag(F); - dag.init(); - - // give each path a unique integer value - dag.calculatePathNumbers(); - - // modify path increments to increase the efficiency - // of instrumentation - dag.calculateSpanningTree(); - dag.calculateChordIncrements(); - dag.pushInitialization(); - dag.pushCounters(); - dag.unlinkPhony(); - - // potentially generate .dot graph for the dag - if (DotPathDag) - dag.generateDotGraph (); - - // Should we store the information in an array or hash - if( dag.getNumberOfPaths() <= HASH_THRESHHOLD ) { - Type* t = ArrayType::get(Type::getInt32Ty(*Context), - dag.getNumberOfPaths()); - - dag.setCounterArray(new GlobalVariable(M, t, false, - GlobalValue::InternalLinkage, - Constant::getNullValue(t), "")); - } - - insertInstrumentation(dag, M); - - // Add to global function reference table - unsigned type; - Type* voidPtr = TypeBuilder<types::i<8>*, true>::get(*Context); - - if( dag.getNumberOfPaths() <= HASH_THRESHHOLD ) - type = ProfilingArray; - else - type = ProfilingHash; - - std::vector<Constant*> entryArray(3); - entryArray[0] = createIncrementConstant(type,32); - entryArray[1] = createIncrementConstant(dag.getNumberOfPaths(),32); - entryArray[2] = dag.getCounterArray() ? 
- ConstantExpr::getBitCast(dag.getCounterArray(), voidPtr) : - Constant::getNullValue(voidPtr); - - StructType* at = ftEntryTypeBuilder::get(*Context); - ConstantStruct* functionEntry = - (ConstantStruct*)ConstantStruct::get(at, entryArray); - ftInit.push_back(functionEntry); -} - -// Output the bitcode if we want to observe instrumentation changess -#define PRINT_MODULE dbgs() << \ - "\n\n============= MODULE BEGIN ===============\n" << M << \ - "\n============== MODULE END ================\n" - -bool PathProfiler::runOnModule(Module &M) { - Context = &M.getContext(); - - DEBUG(dbgs() - << "****************************************\n" - << "****************************************\n" - << "** **\n" - << "** PATH PROFILING INSTRUMENTATION **\n" - << "** **\n" - << "****************************************\n" - << "****************************************\n"); - - // No main, no instrumentation! - Function *Main = M.getFunction("main"); - - // Using fortran? ... this kind of works - if (!Main) - Main = M.getFunction("MAIN__"); - - if (!Main) { - errs() << "WARNING: cannot insert path profiling into a module" - << " with no main function!\n"; - return false; - } - - llvmIncrementHashFunction = M.getOrInsertFunction( - "llvm_increment_path_count", - Type::getVoidTy(*Context), // return type - Type::getInt32Ty(*Context), // function number - Type::getInt32Ty(*Context), // path number - NULL ); - - llvmDecrementHashFunction = M.getOrInsertFunction( - "llvm_decrement_path_count", - Type::getVoidTy(*Context), // return type - Type::getInt32Ty(*Context), // function number - Type::getInt32Ty(*Context), // path number - NULL ); - - std::vector<Constant*> ftInit; - unsigned functionNumber = 0; - for (Module::iterator F = M.begin(), E = M.end(); F != E; F++) { - if (F->isDeclaration()) - continue; - - DEBUG(dbgs() << "Function: " << F->getName() << "\n"); - functionNumber++; - - // set function number - currentFunctionNumber = functionNumber; - runOnFunction(ftInit, *F, M); - } - - Type *t = ftEntryTypeBuilder::get(*Context); - ArrayType* ftArrayType = ArrayType::get(t, ftInit.size()); - Constant* ftInitConstant = ConstantArray::get(ftArrayType, ftInit); - - DEBUG(dbgs() << " ftArrayType:" << *ftArrayType << "\n"); - - GlobalVariable* functionTable = - new GlobalVariable(M, ftArrayType, false, GlobalValue::InternalLinkage, - ftInitConstant, "functionPathTable"); - Type *eltType = ftArrayType->getTypeAtIndex((unsigned)0); - InsertProfilingInitCall(Main, "llvm_start_path_profiling", functionTable, - PointerType::getUnqual(eltType)); - - DEBUG(PRINT_MODULE); - - return true; -} - -// If this edge is a critical edge, then inserts a node at this edge. -// This edge becomes the first edge, and a new BallLarusEdge is created. 
-// Returns true if the edge was split -bool PathProfiler::splitCritical(BLInstrumentationEdge* edge, - BLInstrumentationDag* dag) { - unsigned succNum = edge->getSuccessorNumber(); - BallLarusNode* sourceNode = edge->getSource(); - BallLarusNode* targetNode = edge->getTarget(); - BasicBlock* sourceBlock = sourceNode->getBlock(); - BasicBlock* targetBlock = targetNode->getBlock(); - - if(sourceBlock == NULL || targetBlock == NULL - || sourceNode->getNumberSuccEdges() <= 1 - || targetNode->getNumberPredEdges() == 1 ) { - return(false); - } - - TerminatorInst* terminator = sourceBlock->getTerminator(); - - if( SplitCriticalEdge(terminator, succNum, this, false)) { - BasicBlock* newBlock = terminator->getSuccessor(succNum); - dag->splitUpdate(edge, newBlock); - return(true); - } else - return(false); -} diff --git a/contrib/llvm/lib/Transforms/Instrumentation/ProfilingUtils.cpp b/contrib/llvm/lib/Transforms/Instrumentation/ProfilingUtils.cpp deleted file mode 100644 index 4b3de6d..0000000 --- a/contrib/llvm/lib/Transforms/Instrumentation/ProfilingUtils.cpp +++ /dev/null @@ -1,169 +0,0 @@ -//===- ProfilingUtils.cpp - Helper functions shared by profilers ----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements a few helper functions which are used by profile -// instrumentation code to instrument the code. This allows the profiler pass -// to worry about *what* to insert, and these functions take care of *how* to do -// it. -// -//===----------------------------------------------------------------------===// - -#include "ProfilingUtils.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" - -void llvm::InsertProfilingInitCall(Function *MainFn, const char *FnName, - GlobalValue *Array, - PointerType *arrayType) { - LLVMContext &Context = MainFn->getContext(); - Type *ArgVTy = - PointerType::getUnqual(Type::getInt8PtrTy(Context)); - PointerType *UIntPtr = arrayType ? arrayType : - Type::getInt32PtrTy(Context); - Module &M = *MainFn->getParent(); - Constant *InitFn = M.getOrInsertFunction(FnName, Type::getInt32Ty(Context), - Type::getInt32Ty(Context), - ArgVTy, UIntPtr, - Type::getInt32Ty(Context), - (Type *)0); - - // This could force argc and argv into programs that wouldn't otherwise have - // them, but instead we just pass null values in. - std::vector<Value*> Args(4); - Args[0] = Constant::getNullValue(Type::getInt32Ty(Context)); - Args[1] = Constant::getNullValue(ArgVTy); - - // Skip over any allocas in the entry block. - BasicBlock *Entry = MainFn->begin(); - BasicBlock::iterator InsertPos = Entry->begin(); - while (isa<AllocaInst>(InsertPos)) ++InsertPos; - - std::vector<Constant*> GEPIndices(2, - Constant::getNullValue(Type::getInt32Ty(Context))); - unsigned NumElements = 0; - if (Array) { - Args[2] = ConstantExpr::getGetElementPtr(Array, GEPIndices); - NumElements = - cast<ArrayType>(Array->getType()->getElementType())->getNumElements(); - } else { - // If this profiling instrumentation doesn't have a constant array, just - // pass null. 
- Args[2] = ConstantPointerNull::get(UIntPtr); - } - Args[3] = ConstantInt::get(Type::getInt32Ty(Context), NumElements); - - CallInst *InitCall = CallInst::Create(InitFn, Args, "newargc", InsertPos); - - // If argc or argv are not available in main, just pass null values in. - Function::arg_iterator AI; - switch (MainFn->arg_size()) { - default: - case 2: - AI = MainFn->arg_begin(); ++AI; - if (AI->getType() != ArgVTy) { - Instruction::CastOps opcode = CastInst::getCastOpcode(AI, false, ArgVTy, - false); - InitCall->setArgOperand(1, - CastInst::Create(opcode, AI, ArgVTy, "argv.cast", InitCall)); - } else { - InitCall->setArgOperand(1, AI); - } - /* FALL THROUGH */ - - case 1: - AI = MainFn->arg_begin(); - // If the program looked at argc, have it look at the return value of the - // init call instead. - if (!AI->getType()->isIntegerTy(32)) { - Instruction::CastOps opcode; - if (!AI->use_empty()) { - opcode = CastInst::getCastOpcode(InitCall, true, AI->getType(), true); - AI->replaceAllUsesWith( - CastInst::Create(opcode, InitCall, AI->getType(), "", InsertPos)); - } - opcode = CastInst::getCastOpcode(AI, true, - Type::getInt32Ty(Context), true); - InitCall->setArgOperand(0, - CastInst::Create(opcode, AI, Type::getInt32Ty(Context), - "argc.cast", InitCall)); - } else { - AI->replaceAllUsesWith(InitCall); - InitCall->setArgOperand(0, AI); - } - - case 0: break; - } -} - -void llvm::IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum, - GlobalValue *CounterArray, bool beginning) { - // Insert the increment after any alloca or PHI instructions... - BasicBlock::iterator InsertPos = beginning ? BB->getFirstInsertionPt() : - BB->getTerminator(); - while (isa<AllocaInst>(InsertPos)) - ++InsertPos; - - LLVMContext &Context = BB->getContext(); - - // Create the getelementptr constant expression - std::vector<Constant*> Indices(2); - Indices[0] = Constant::getNullValue(Type::getInt32Ty(Context)); - Indices[1] = ConstantInt::get(Type::getInt32Ty(Context), CounterNum); - Constant *ElementPtr = - ConstantExpr::getGetElementPtr(CounterArray, Indices); - - // Load, increment and store the value back. - Value *OldVal = new LoadInst(ElementPtr, "OldFuncCounter", InsertPos); - Value *NewVal = BinaryOperator::Create(Instruction::Add, OldVal, - ConstantInt::get(Type::getInt32Ty(Context), 1), - "NewFuncCounter", InsertPos); - new StoreInst(NewVal, ElementPtr, InsertPos); -} - -void llvm::InsertProfilingShutdownCall(Function *Callee, Module *Mod) { - // llvm.global_dtors is an array of type { i32, void ()* }. Prepare those - // types. - Type *GlobalDtorElems[2] = { - Type::getInt32Ty(Mod->getContext()), - FunctionType::get(Type::getVoidTy(Mod->getContext()), false)->getPointerTo() - }; - StructType *GlobalDtorElemTy = - StructType::get(Mod->getContext(), GlobalDtorElems, false); - - // Construct the new element we'll be adding. - Constant *Elem[2] = { - ConstantInt::get(Type::getInt32Ty(Mod->getContext()), 65535), - ConstantExpr::getBitCast(Callee, GlobalDtorElems[1]) - }; - - // If llvm.global_dtors exists, make a copy of the things in its list and - // delete it, to replace it with one that has a larger array type. 
- std::vector<Constant *> dtors; - if (GlobalVariable *GlobalDtors = Mod->getNamedGlobal("llvm.global_dtors")) { - if (ConstantArray *InitList = - dyn_cast<ConstantArray>(GlobalDtors->getInitializer())) { - for (unsigned i = 0, e = InitList->getType()->getNumElements(); - i != e; ++i) - dtors.push_back(cast<Constant>(InitList->getOperand(i))); - } - GlobalDtors->eraseFromParent(); - } - - // Build up llvm.global_dtors with our new item in it. - GlobalVariable *GlobalDtors = new GlobalVariable( - *Mod, ArrayType::get(GlobalDtorElemTy, 1), false, - GlobalValue::AppendingLinkage, NULL, "llvm.global_dtors"); - - dtors.push_back(ConstantStruct::get(GlobalDtorElemTy, Elem)); - GlobalDtors->setInitializer(ConstantArray::get( - cast<ArrayType>(GlobalDtors->getType()->getElementType()), dtors)); -} diff --git a/contrib/llvm/lib/Transforms/Instrumentation/ProfilingUtils.h b/contrib/llvm/lib/Transforms/Instrumentation/ProfilingUtils.h deleted file mode 100644 index 09b2217..0000000 --- a/contrib/llvm/lib/Transforms/Instrumentation/ProfilingUtils.h +++ /dev/null @@ -1,36 +0,0 @@ -//===- ProfilingUtils.h - Helper functions shared by profilers --*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines a few helper functions which are used by profile -// instrumentation code to instrument the code. This allows the profiler pass -// to worry about *what* to insert, and these functions take care of *how* to do -// it. -// -//===----------------------------------------------------------------------===// - -#ifndef PROFILINGUTILS_H -#define PROFILINGUTILS_H - -namespace llvm { - class BasicBlock; - class Function; - class GlobalValue; - class Module; - class PointerType; - - void InsertProfilingInitCall(Function *MainFn, const char *FnName, - GlobalValue *Arr = 0, - PointerType *arrayType = 0); - void IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum, - GlobalValue *CounterArray, - bool beginning = true); - void InsertProfilingShutdownCall(Function *Callee, Module *Mod); -} - -#endif diff --git a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 299060a..89fb746 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -41,8 +41,8 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/BlackList.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Transforms/Utils/SpecialCaseList.h" using namespace llvm; @@ -99,7 +99,7 @@ struct ThreadSanitizer : public FunctionPass { DataLayout *TD; Type *IntptrTy; SmallString<64> BlacklistFile; - OwningPtr<BlackList> BL; + OwningPtr<SpecialCaseList> BL; IntegerType *OrdTy; // Callbacks to run-time library are computed in doInitialization. Function *TsanFuncEntry; @@ -227,7 +227,7 @@ bool ThreadSanitizer::doInitialization(Module &M) { TD = getAnalysisIfAvailable<DataLayout>(); if (!TD) return false; - BL.reset(new BlackList(BlacklistFile)); + BL.reset(SpecialCaseList::createOrDie(BlacklistFile)); // Always insert a call to __tsan_init into the module's CTORs. 
  IRBuilder<> IRB(M.getContext());
@@ -240,12 +240,8 @@ bool ThreadSanitizer::doInitialization(Module &M) {
 }
 
 static bool isVtableAccess(Instruction *I) {
-  if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa)) {
-    if (Tag->getNumOperands() < 1) return false;
-    if (MDString *Tag1 = dyn_cast<MDString>(Tag->getOperand(0))) {
-      if (Tag1->getString() == "vtable pointer") return true;
-    }
-  }
+  if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa))
+    return Tag->isTBAAVtableAccess();
   return false;
 }
 
@@ -362,7 +358,7 @@ bool ThreadSanitizer::runOnFunction(Function &F) {
   // (e.g. variables that do not escape, etc).
 
   // Instrument memory accesses.
-  if (ClInstrumentMemoryAccesses)
+  if (ClInstrumentMemoryAccesses && F.hasFnAttribute(Attribute::SanitizeThread))
     for (size_t i = 0, n = AllLoadsAndStores.size(); i < n; ++i) {
       Res |= instrumentLoadOrStore(AllLoadsAndStores[i]);
     }
@@ -579,7 +575,7 @@ int ThreadSanitizer::getMemoryAccessFuncIndex(Value *Addr) {
     // Ignore all unusual sizes.
     return -1;
   }
-  size_t Idx = CountTrailingZeros_32(TypeSize / 8);
+  size_t Idx = countTrailingZeros(TypeSize / 8);
   assert(Idx < kNumberOfAccessSizes);
   return Idx;
 }
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
new file mode 100644
index 0000000..4eac39d
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
@@ -0,0 +1,186 @@
+//===- ARCRuntimeEntryPoints.h - ObjC ARC Optimization --*- C++ -*---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file contains a class ARCRuntimeEntryPoints for use in
+/// creating/managing references to entry points to the ARC Objective-C
+/// runtime.
+///
+/// WARNING: This file knows about certain library functions. It recognizes
+/// them by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions
+/// are used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_ARCRUNTIMEENTRYPOINTS_H
+#define LLVM_TRANSFORMS_SCALAR_ARCRUNTIMEENTRYPOINTS_H
+
+#include "ObjCARC.h"
+
+namespace llvm {
+namespace objcarc {
+
+/// Declarations for ObjC runtime functions and constants. These are
+/// initialized lazily to avoid cluttering up the Module with unused
+/// declarations.
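+///
+/// A pass keeps one of these objects as a member, points it at the current
+/// module with Initialize(&M) (as ObjCARCContract::doInitialization does later
+/// in this change), and then requests declarations on demand via
+/// get(EPT_Retain), get(EPT_Release), and so on; each declaration is created
+/// on first use and cached for the rest of the module.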
+class ARCRuntimeEntryPoints { +public: + enum EntryPointType { + EPT_AutoreleaseRV, + EPT_Release, + EPT_Retain, + EPT_RetainBlock, + EPT_Autorelease, + EPT_StoreStrong, + EPT_RetainRV, + EPT_RetainAutorelease, + EPT_RetainAutoreleaseRV + }; + + ARCRuntimeEntryPoints() : TheModule(0), + AutoreleaseRV(0), + Release(0), + Retain(0), + RetainBlock(0), + Autorelease(0), + StoreStrong(0), + RetainRV(0), + RetainAutorelease(0), + RetainAutoreleaseRV(0) { } + + ~ARCRuntimeEntryPoints() { } + + void Initialize(Module *M) { + TheModule = M; + AutoreleaseRV = 0; + Release = 0; + Retain = 0; + RetainBlock = 0; + Autorelease = 0; + StoreStrong = 0; + RetainRV = 0; + RetainAutorelease = 0; + RetainAutoreleaseRV = 0; + } + + Constant *get(const EntryPointType entry) { + assert(TheModule != 0 && "Not initialized."); + + switch (entry) { + case EPT_AutoreleaseRV: + return getI8XRetI8XEntryPoint(AutoreleaseRV, + "objc_autoreleaseReturnValue", true); + case EPT_Release: + return getVoidRetI8XEntryPoint(Release, "objc_release"); + case EPT_Retain: + return getI8XRetI8XEntryPoint(Retain, "objc_retain", true); + case EPT_RetainBlock: + return getI8XRetI8XEntryPoint(RetainBlock, "objc_retainBlock", false); + case EPT_Autorelease: + return getI8XRetI8XEntryPoint(Autorelease, "objc_autorelease", true); + case EPT_StoreStrong: + return getI8XRetI8XXI8XEntryPoint(StoreStrong, "objc_storeStrong"); + case EPT_RetainRV: + return getI8XRetI8XEntryPoint(RetainRV, + "objc_retainAutoreleasedReturnValue", true); + case EPT_RetainAutorelease: + return getI8XRetI8XEntryPoint(RetainAutorelease, "objc_retainAutorelease", + true); + case EPT_RetainAutoreleaseRV: + return getI8XRetI8XEntryPoint(RetainAutoreleaseRV, + "objc_retainAutoreleaseReturnValue", true); + } + + llvm_unreachable("Switch should be a covered switch."); + } + +private: + /// Cached reference to the module which we will insert declarations into. + Module *TheModule; + + /// Declaration for ObjC runtime function objc_autoreleaseReturnValue. + Constant *AutoreleaseRV; + /// Declaration for ObjC runtime function objc_release. + Constant *Release; + /// Declaration for ObjC runtime function objc_retain. + Constant *Retain; + /// Declaration for ObjC runtime function objc_retainBlock. + Constant *RetainBlock; + /// Declaration for ObjC runtime function objc_autorelease. + Constant *Autorelease; + /// Declaration for objc_storeStrong(). + Constant *StoreStrong; + /// Declaration for objc_retainAutoreleasedReturnValue(). + Constant *RetainRV; + /// Declaration for objc_retainAutorelease(). + Constant *RetainAutorelease; + /// Declaration for objc_retainAutoreleaseReturnValue(). 
+ Constant *RetainAutoreleaseRV; + + Constant *getVoidRetI8XEntryPoint(Constant *&Decl, + const char *Name) { + if (Decl) + return Decl; + + LLVMContext &C = TheModule->getContext(); + Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; + AttributeSet Attr = + AttributeSet().addAttribute(C, AttributeSet::FunctionIndex, + Attribute::NoUnwind); + FunctionType *Fty = FunctionType::get(Type::getVoidTy(C), Params, + /*isVarArg=*/false); + return Decl = TheModule->getOrInsertFunction(Name, Fty, Attr); + } + + Constant *getI8XRetI8XEntryPoint(Constant *& Decl, + const char *Name, + bool NoUnwind = false) { + if (Decl) + return Decl; + + LLVMContext &C = TheModule->getContext(); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *Params[] = { I8X }; + FunctionType *Fty = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttributeSet Attr = AttributeSet(); + + if (NoUnwind) + Attr = Attr.addAttribute(C, AttributeSet::FunctionIndex, + Attribute::NoUnwind); + + return Decl = TheModule->getOrInsertFunction(Name, Fty, Attr); + } + + Constant *getI8XRetI8XXI8XEntryPoint(Constant *&Decl, + const char *Name) { + if (Decl) + return Decl; + + LLVMContext &C = TheModule->getContext(); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *I8XX = PointerType::getUnqual(I8X); + Type *Params[] = { I8XX, I8X }; + + AttributeSet Attr = + AttributeSet().addAttribute(C, AttributeSet::FunctionIndex, + Attribute::NoUnwind); + Attr = Attr.addAttribute(C, 1, Attribute::NoCapture); + + FunctionType *Fty = FunctionType::get(Type::getVoidTy(C), Params, + /*isVarArg=*/false); + + return Decl = TheModule->getOrInsertFunction(Name, Fty, Attr); + } + +}; // class ARCRuntimeEntryPoints + +} // namespace objcarc +} // namespace llvm + +#endif // LLVM_TRANSFORMS_SCALAR_ARCRUNTIMEENTRYPOINTS_H diff --git a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h index 24d358b..617cdf3 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h +++ b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h @@ -1,4 +1,4 @@ -//===- DependencyAnalysis.h - ObjC ARC Optimization ---*- mode: c++ -*-----===// +//===- DependencyAnalysis.h - ObjC ARC Optimization ---*- C++ -*-----------===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h index 39670f3..8044494 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h @@ -1,4 +1,4 @@ -//===- ObjCARC.h - ObjC ARC Optimization --------------*- mode: c++ -*-----===// +//===- ObjCARC.h - ObjC ARC Optimization --------------*- C++ -*-----------===// // // The LLVM Compiler Infrastructure // @@ -286,7 +286,9 @@ static inline void EraseInstruction(Instruction *CI) { if (!Unused) { // Replace the return value with the argument. 
- assert(IsForwarding(GetBasicInstructionClass(CI)) && + assert((IsForwarding(GetBasicInstructionClass(CI)) || + (IsNoopOnNull(GetBasicInstructionClass(CI)) && + isa<ConstantPointerNull>(OldArg))) && "Can't delete non-forwarding instruction with users!"); CI->replaceAllUsesWith(OldArg); } diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp index 46b2de7..d18667b 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp @@ -1,4 +1,4 @@ -//===- ObjCARCAliasAnalysis.cpp - ObjC ARC Optimization -*- mode: c++ -*---===// +//===- ObjCARCAliasAnalysis.cpp - ObjC ARC Optimization -------------------===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h index 7abe995..41ccfe2 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h @@ -1,4 +1,4 @@ -//===- ObjCARCAliasAnalysis.h - ObjC ARC Optimization -*- mode: c++ -*-----===// +//===- ObjCARCAliasAnalysis.h - ObjC ARC Optimization -*- C++ -*-----------===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp index c43f4f4..9d80037 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -28,6 +28,7 @@ #define DEBUG_TYPE "objc-arc-contract" #include "ObjCARC.h" +#include "ARCRuntimeEntryPoints.h" #include "DependencyAnalysis.h" #include "ProvenanceAnalysis.h" #include "llvm/ADT/Statistic.h" @@ -52,23 +53,11 @@ namespace { AliasAnalysis *AA; DominatorTree *DT; ProvenanceAnalysis PA; + ARCRuntimeEntryPoints EP; /// A flag indicating whether this optimization pass should run. bool Run; - /// Declarations for ObjC runtime functions, for use in creating calls to - /// them. These are initialized lazily to avoid cluttering up the Module - /// with unused declarations. - - /// Declaration for objc_storeStrong(). - Constant *StoreStrongCallee; - /// Declaration for objc_retainAutorelease(). - Constant *RetainAutoreleaseCallee; - /// Declaration for objc_retainAutoreleaseReturnValue(). - Constant *RetainAutoreleaseRVCallee; - /// Declaration for objc_retainAutoreleasedReturnValue(). - Constant *RetainRVCallee; - /// The inline asm string to insert between calls and RetainRV calls to make /// the optimization work on targets which need it. const MDString *RetainRVMarker; @@ -78,11 +67,6 @@ namespace { /// "tail". 
SmallPtrSet<CallInst *, 8> StoreStrongCalls; - Constant *getStoreStrongCallee(Module *M); - Constant *getRetainRVCallee(Module *M); - Constant *getRetainAutoreleaseCallee(Module *M); - Constant *getRetainAutoreleaseRVCallee(Module *M); - bool OptimizeRetainCall(Function &F, Instruction *Retain); bool ContractAutorelease(Function &F, Instruction *Autorelease, @@ -125,74 +109,6 @@ void ObjCARCContract::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); } -Constant *ObjCARCContract::getStoreStrongCallee(Module *M) { - if (!StoreStrongCallee) { - LLVMContext &C = M->getContext(); - Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - Type *I8XX = PointerType::getUnqual(I8X); - Type *Params[] = { I8XX, I8X }; - - AttributeSet Attr = AttributeSet() - .addAttribute(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind) - .addAttribute(M->getContext(), 1, Attribute::NoCapture); - - StoreStrongCallee = - M->getOrInsertFunction( - "objc_storeStrong", - FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false), - Attr); - } - return StoreStrongCallee; -} - -Constant *ObjCARCContract::getRetainAutoreleaseCallee(Module *M) { - if (!RetainAutoreleaseCallee) { - LLVMContext &C = M->getContext(); - Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - Type *Params[] = { I8X }; - FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttributeSet Attribute = - AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); - RetainAutoreleaseCallee = - M->getOrInsertFunction("objc_retainAutorelease", FTy, Attribute); - } - return RetainAutoreleaseCallee; -} - -Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) { - if (!RetainAutoreleaseRVCallee) { - LLVMContext &C = M->getContext(); - Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - Type *Params[] = { I8X }; - FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttributeSet Attribute = - AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); - RetainAutoreleaseRVCallee = - M->getOrInsertFunction("objc_retainAutoreleaseReturnValue", FTy, - Attribute); - } - return RetainAutoreleaseRVCallee; -} - -Constant *ObjCARCContract::getRetainRVCallee(Module *M) { - if (!RetainRVCallee) { - LLVMContext &C = M->getContext(); - Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - Type *Params[] = { I8X }; - FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttributeSet Attribute = - AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); - RetainRVCallee = - M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy, - Attribute); - } - return RetainRVCallee; -} - /// Turn objc_retain into objc_retainAutoreleasedReturnValue if the operand is a /// return value. We do this late so we do not disrupt the dataflow analysis in /// ObjCARCOpt. @@ -222,7 +138,8 @@ ObjCARCContract::OptimizeRetainCall(Function &F, Instruction *Retain) { // We do not have to worry about tail calls/does not throw since // retain/retainRV have the same properties. 
- cast<CallInst>(Retain)->setCalledFunction(getRetainRVCallee(F.getParent())); + Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_RetainRV); + cast<CallInst>(Retain)->setCalledFunction(Decl); DEBUG(dbgs() << "New: " << *Retain << "\n"); return true; @@ -272,10 +189,10 @@ ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease, " Old Retain: " << *Retain << "\n"); - if (Class == IC_AutoreleaseRV) - Retain->setCalledFunction(getRetainAutoreleaseRVCallee(F.getParent())); - else - Retain->setCalledFunction(getRetainAutoreleaseCallee(F.getParent())); + Constant *Decl = EP.get(Class == IC_AutoreleaseRV ? + ARCRuntimeEntryPoints::EPT_RetainAutoreleaseRV : + ARCRuntimeEntryPoints::EPT_RetainAutorelease); + Retain->setCalledFunction(Decl); DEBUG(dbgs() << " New Retain: " << *Retain << "\n"); @@ -356,9 +273,8 @@ void ObjCARCContract::ContractRelease(Instruction *Release, Args[0] = new BitCastInst(Args[0], I8XX, "", Store); if (Args[1]->getType() != I8X) Args[1] = new BitCastInst(Args[1], I8X, "", Store); - CallInst *StoreStrong = - CallInst::Create(getStoreStrongCallee(BB->getParent()->getParent()), - Args, "", Store); + Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_StoreStrong); + CallInst *StoreStrong = CallInst::Create(Decl, Args, "", Store); StoreStrong->setDoesNotThrow(); StoreStrong->setDebugLoc(Store->getDebugLoc()); @@ -381,11 +297,7 @@ bool ObjCARCContract::doInitialization(Module &M) { if (!Run) return false; - // These are initialized lazily. - StoreStrongCallee = 0; - RetainAutoreleaseCallee = 0; - RetainAutoreleaseRVCallee = 0; - RetainRVCallee = 0; + EP.Initialize(&M); // Initialize RetainRVMarker. RetainRVMarker = 0; diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index 43e2e20..2976df6 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -26,10 +26,12 @@ #define DEBUG_TYPE "objc-arc-opts" #include "ObjCARC.h" +#include "ARCRuntimeEntryPoints.h" #include "DependencyAnalysis.h" #include "ObjCARCAliasAnalysis.h" #include "ProvenanceAnalysis.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" @@ -107,6 +109,12 @@ namespace { return std::make_pair(Vector.begin() + Pair.first->second, false); } + iterator find(const KeyT &Key) { + typename MapTy::iterator It = Map.find(Key); + if (It == Map.end()) return Vector.end(); + return Vector.begin() + It->second; + } + const_iterator find(const KeyT &Key) const { typename MapTy::const_iterator It = Map.find(Key); if (It == Map.end()) return Vector.end(); @@ -168,91 +176,40 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) { return 0; } -/// \brief Test whether the given retainable object pointer escapes. -/// -/// This differs from regular escape analysis in that a use as an -/// argument to a call is not considered an escape. -/// -static bool DoesRetainableObjPtrEscape(const User *Ptr) { - DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Target: " << *Ptr << "\n"); - - // Walk the def-use chains. +/// This is a wrapper around getUnderlyingObjCPtr along the lines of +/// GetUnderlyingObjects except that it returns early when it sees the first +/// alloca. 
+static inline bool AreAnyUnderlyingObjectsAnAlloca(const Value *V) { + SmallPtrSet<const Value *, 4> Visited; SmallVector<const Value *, 4> Worklist; - Worklist.push_back(Ptr); - // If Ptr has any operands add them as well. - for (User::const_op_iterator I = Ptr->op_begin(), E = Ptr->op_end(); I != E; - ++I) { - Worklist.push_back(*I); - } - - // Ensure we do not visit any value twice. - SmallPtrSet<const Value *, 8> VisitedSet; - + Worklist.push_back(V); do { - const Value *V = Worklist.pop_back_val(); + const Value *P = Worklist.pop_back_val(); + P = GetUnderlyingObjCPtr(P); - DEBUG(dbgs() << "Visiting: " << *V << "\n"); + if (isa<AllocaInst>(P)) + return true; - for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end(); - UI != UE; ++UI) { - const User *UUser = *UI; + if (!Visited.insert(P)) + continue; - DEBUG(dbgs() << "User: " << *UUser << "\n"); + if (const SelectInst *SI = dyn_cast<const SelectInst>(P)) { + Worklist.push_back(SI->getTrueValue()); + Worklist.push_back(SI->getFalseValue()); + continue; + } - // Special - Use by a call (callee or argument) is not considered - // to be an escape. - switch (GetBasicInstructionClass(UUser)) { - case IC_StoreWeak: - case IC_InitWeak: - case IC_StoreStrong: - case IC_Autorelease: - case IC_AutoreleaseRV: { - DEBUG(dbgs() << "User copies pointer arguments. Pointer Escapes!\n"); - // These special functions make copies of their pointer arguments. - return true; - } - case IC_IntrinsicUser: - // Use by the use intrinsic is not an escape. - continue; - case IC_User: - case IC_None: - // Use by an instruction which copies the value is an escape if the - // result is an escape. - if (isa<BitCastInst>(UUser) || isa<GetElementPtrInst>(UUser) || - isa<PHINode>(UUser) || isa<SelectInst>(UUser)) { - - if (VisitedSet.insert(UUser)) { - DEBUG(dbgs() << "User copies value. Ptr escapes if result escapes." - " Adding to list.\n"); - Worklist.push_back(UUser); - } else { - DEBUG(dbgs() << "Already visited node.\n"); - } - continue; - } - // Use by a load is not an escape. - if (isa<LoadInst>(UUser)) - continue; - // Use by a store is not an escape if the use is the address. - if (const StoreInst *SI = dyn_cast<StoreInst>(UUser)) - if (V != SI->getValueOperand()) - continue; - break; - default: - // Regular calls and other stuff are not considered escapes. - continue; - } - // Otherwise, conservatively assume an escape. - DEBUG(dbgs() << "Assuming ptr escapes.\n"); - return true; + if (const PHINode *PN = dyn_cast<const PHINode>(P)) { + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + Worklist.push_back(PN->getIncomingValue(i)); + continue; } } while (!Worklist.empty()); - // No escapes found. - DEBUG(dbgs() << "Ptr does not escape.\n"); return false; } + /// @} /// /// \defgroup ARCOpt ARC Optimization. 
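The new helper above is a conventional visited-set worklist walk: it chases
select arms and PHI incoming values and exits as soon as any underlying object
is an alloca. A minimal self-contained sketch of the same early-exit pattern,
with Node and Sources standing in for llvm::Value and the select/PHI operand
lists (these names are illustrative, not part of the patch):

    #include <unordered_set>
    #include <vector>

    struct Node {
      bool IsAlloca = false;
      std::vector<const Node *> Sources; // stand-in for select arms / PHI inputs
    };

    static bool anyUnderlyingIsAlloca(const Node *Root) {
      std::unordered_set<const Node *> Visited;
      std::vector<const Node *> Worklist;
      Worklist.push_back(Root);
      do {
        const Node *P = Worklist.back();
        Worklist.pop_back();
        if (P->IsAlloca)
          return true;                  // early exit on the first alloca found
        if (!Visited.insert(P).second)
          continue;                     // already visited; cycles terminate here
        for (const Node *S : P->Sources)
          Worklist.push_back(S);
      } while (!Worklist.empty());
      return false;
    }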
@@ -300,18 +257,18 @@ STATISTIC(NumNoops,       "Number of no-op objc calls eliminated");
 STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated");
 STATISTIC(NumAutoreleases,"Number of autoreleases converted to releases");
 STATISTIC(NumRets,        "Number of return value forwarding "
-                          "retain+autoreleaes eliminated");
+                          "retain+autoreleases eliminated");
 STATISTIC(NumRRs,         "Number of retain+release paths eliminated");
 STATISTIC(NumPeeps,       "Number of calls peephole-optimized");
+#ifndef NDEBUG
 STATISTIC(NumRetainsBeforeOpt,
-          "Number of retains before optimization.");
+          "Number of retains before optimization");
 STATISTIC(NumReleasesBeforeOpt,
-          "Number of releases before optimization.");
-#ifndef NDEBUG
+          "Number of releases before optimization");
 STATISTIC(NumRetainsAfterOpt,
-          "Number of retains after optimization.");
+          "Number of retains after optimization");
 STATISTIC(NumReleasesAfterOpt,
-          "Number of releases after optimization.");
+          "Number of releases after optimization");
 #endif
 
 namespace {
@@ -414,14 +371,20 @@ namespace {
     /// sequence.
     SmallPtrSet<Instruction *, 2> ReverseInsertPts;
 
+    /// If this is true, we cannot perform code motion but can still remove
+    /// retain/release pairs.
+    bool CFGHazardAfflicted;
+
     RRInfo() :
-      KnownSafe(false), IsTailCallRelease(false), ReleaseMetadata(0) {}
+      KnownSafe(false), IsTailCallRelease(false), ReleaseMetadata(0),
+      CFGHazardAfflicted(false) {}
 
     void clear();
 
-    bool IsTrackingImpreciseReleases() {
-      return ReleaseMetadata != 0;
-    }
+    /// Conservatively merge the two RRInfo. Returns true if a partial merge
+    /// has occurred, false otherwise.
+    bool Merge(const RRInfo &Other);
+
   };
 }
 
@@ -431,6 +394,30 @@ void RRInfo::clear() {
   ReleaseMetadata = 0;
   Calls.clear();
   ReverseInsertPts.clear();
+  CFGHazardAfflicted = false;
+}
+
+bool RRInfo::Merge(const RRInfo &Other) {
+  // Conservatively merge the ReleaseMetadata information.
+  if (ReleaseMetadata != Other.ReleaseMetadata)
+    ReleaseMetadata = 0;
+
+  // Conservatively merge the boolean state.
+  KnownSafe &= Other.KnownSafe;
+  IsTailCallRelease &= Other.IsTailCallRelease;
+  CFGHazardAfflicted |= Other.CFGHazardAfflicted;
+
+  // Merge the call sets.
+  Calls.insert(Other.Calls.begin(), Other.Calls.end());
+
+  // Merge the insert point sets. If there are any differences,
+  // that makes this a partial merge.
+  bool Partial = ReverseInsertPts.size() != Other.ReverseInsertPts.size();
+  for (SmallPtrSet<Instruction *, 2>::const_iterator
+       I = Other.ReverseInsertPts.begin(),
+       E = Other.ReverseInsertPts.end(); I != E; ++I)
+    Partial |= ReverseInsertPts.insert(*I);
+  return Partial;
 }
 
 namespace {
@@ -445,22 +432,59 @@ namespace {
     bool Partial;
 
     /// The current position in the sequence.
-    Sequence Seq : 8;
+    unsigned char Seq : 8;
 
-  public:
     /// Unidirectional information about the current sequence.
-    ///
-    /// TODO: Encapsulate this better.
RRInfo RRI; + public: PtrState() : KnownPositiveRefCount(false), Partial(false), Seq(S_None) {} + + bool IsKnownSafe() const { + return RRI.KnownSafe; + } + + void SetKnownSafe(const bool NewValue) { + RRI.KnownSafe = NewValue; + } + + bool IsTailCallRelease() const { + return RRI.IsTailCallRelease; + } + + void SetTailCallRelease(const bool NewValue) { + RRI.IsTailCallRelease = NewValue; + } + + bool IsTrackingImpreciseReleases() const { + return RRI.ReleaseMetadata != 0; + } + + const MDNode *GetReleaseMetadata() const { + return RRI.ReleaseMetadata; + } + + void SetReleaseMetadata(MDNode *NewValue) { + RRI.ReleaseMetadata = NewValue; + } + + bool IsCFGHazardAfflicted() const { + return RRI.CFGHazardAfflicted; + } + + void SetCFGHazardAfflicted(const bool NewValue) { + RRI.CFGHazardAfflicted = NewValue; + } + void SetKnownPositiveRefCount() { + DEBUG(dbgs() << "Setting Known Positive.\n"); KnownPositiveRefCount = true; } void ClearKnownPositiveRefCount() { + DEBUG(dbgs() << "Clearing Known Positive.\n"); KnownPositiveRefCount = false; } @@ -474,7 +498,7 @@ namespace { } Sequence GetSeq() const { - return Seq; + return static_cast<Sequence>(Seq); } void ClearSequenceProgress() { @@ -489,13 +513,34 @@ namespace { } void Merge(const PtrState &Other, bool TopDown); + + void InsertCall(Instruction *I) { + RRI.Calls.insert(I); + } + + void InsertReverseInsertPt(Instruction *I) { + RRI.ReverseInsertPts.insert(I); + } + + void ClearReverseInsertPts() { + RRI.ReverseInsertPts.clear(); + } + + bool HasReverseInsertPts() const { + return !RRI.ReverseInsertPts.empty(); + } + + const RRInfo &GetRRInfo() const { + return RRI; + } }; } void PtrState::Merge(const PtrState &Other, bool TopDown) { - Seq = MergeSeqs(Seq, Other.Seq, TopDown); - KnownPositiveRefCount = KnownPositiveRefCount && Other.KnownPositiveRefCount; + Seq = MergeSeqs(static_cast<Sequence>(Seq), static_cast<Sequence>(Other.Seq), + TopDown); + KnownPositiveRefCount &= Other.KnownPositiveRefCount; // If we're not in a sequence (anymore), drop all associated state. if (Seq == S_None) { @@ -508,22 +553,11 @@ PtrState::Merge(const PtrState &Other, bool TopDown) { // mixing them is unsafe. ClearSequenceProgress(); } else { - // Conservatively merge the ReleaseMetadata information. - if (RRI.ReleaseMetadata != Other.RRI.ReleaseMetadata) - RRI.ReleaseMetadata = 0; - - RRI.KnownSafe = RRI.KnownSafe && Other.RRI.KnownSafe; - RRI.IsTailCallRelease = RRI.IsTailCallRelease && - Other.RRI.IsTailCallRelease; - RRI.Calls.insert(Other.RRI.Calls.begin(), Other.RRI.Calls.end()); - - // Merge the insert point sets. If there are any differences, - // that makes this a partial merge. - Partial = RRI.ReverseInsertPts.size() != Other.RRI.ReverseInsertPts.size(); - for (SmallPtrSet<Instruction *, 2>::const_iterator - I = Other.RRI.ReverseInsertPts.begin(), - E = Other.RRI.ReverseInsertPts.end(); I != E; ++I) - Partial |= RRI.ReverseInsertPts.insert(*I); + // Otherwise merge the other PtrState's RRInfo into our RRInfo. At this + // point, we know that currently we are not partial. Stash whether or not + // the merge operation caused us to undergo a partial merging of reverse + // insertion points. 
+    Partial = RRI.Merge(Other.RRI);
   }
 }
 
@@ -556,7 +590,9 @@ namespace {
     SmallVector<BasicBlock *, 2> Succs;
 
   public:
-    BBState() : TopDownPathCount(0), BottomUpPathCount(0) {}
+    static const unsigned OverflowOccurredValue;
+
+    BBState() : TopDownPathCount(0), BottomUpPathCount(0) { }
 
     typedef MapTy::iterator ptr_iterator;
     typedef MapTy::const_iterator ptr_const_iterator;
@@ -587,14 +623,26 @@ namespace {
     /// definition.
     void SetAsExit()  { BottomUpPathCount = 1; }
 
+    /// Attempt to find the PtrState object describing the top down state for
+    /// pointer Arg. Return a new initialized PtrState describing the top down
+    /// state for Arg if we do not find one.
     PtrState &getPtrTopDownState(const Value *Arg) {
       return PerPtrTopDown[Arg];
     }
 
+    /// Attempt to find the PtrState object describing the bottom up state for
+    /// pointer Arg. Return a new initialized PtrState describing the bottom up
+    /// state for Arg if we do not find one.
     PtrState &getPtrBottomUpState(const Value *Arg) {
       return PerPtrBottomUp[Arg];
     }
 
+    /// Attempt to find the PtrState object describing the bottom up state for
+    /// pointer Arg.
+    ptr_iterator findPtrBottomUpState(const Value *Arg) {
+      return PerPtrBottomUp.find(Arg);
+    }
+
     void clearBottomUpPointers() {
       PerPtrBottomUp.clear();
     }
@@ -608,27 +656,38 @@ namespace {
     void MergePred(const BBState &Other);
     void MergeSucc(const BBState &Other);
 
-    /// Return the number of possible unique paths from an entry to an exit
+    /// Compute the number of possible unique paths from an entry to an exit
     /// which pass through this block. This is only valid after both the
     /// top-down and bottom-up traversals are complete.
-    unsigned GetAllPathCount() const {
-      assert(TopDownPathCount != 0);
-      assert(BottomUpPathCount != 0);
-      return TopDownPathCount * BottomUpPathCount;
+    ///
+    /// Returns true if overflow occurred and false otherwise.
+    bool GetAllPathCountWithOverflow(unsigned &PathCount) const {
+      if (TopDownPathCount == OverflowOccurredValue ||
+          BottomUpPathCount == OverflowOccurredValue)
+        return true;
+      unsigned long long Product =
+        (unsigned long long)TopDownPathCount*BottomUpPathCount;
+      // Overflow occurred if any of the upper bits of Product are set or if
+      // all the lower bits of Product are set.
+      return (Product >> 32) ||
+             ((PathCount = Product) == OverflowOccurredValue);
     }
 
     // Specialized CFG utilities.
     typedef SmallVectorImpl<BasicBlock *>::const_iterator edge_iterator;
-    edge_iterator pred_begin() { return Preds.begin(); }
-    edge_iterator pred_end() { return Preds.end(); }
-    edge_iterator succ_begin() { return Succs.begin(); }
-    edge_iterator succ_end() { return Succs.end(); }
+    edge_iterator pred_begin() const { return Preds.begin(); }
+    edge_iterator pred_end() const { return Preds.end(); }
+    edge_iterator succ_begin() const { return Succs.begin(); }
+    edge_iterator succ_end() const { return Succs.end(); }
 
     void addSucc(BasicBlock *Succ) { Succs.push_back(Succ); }
     void addPred(BasicBlock *Pred) { Preds.push_back(Pred); }
 
     bool isExit() const { return Succs.empty(); }
   };
+
+  const unsigned BBState::OverflowOccurredValue = 0xffffffff;
 }
 
 void BBState::InitFromPred(const BBState &Other) {
@@ -644,13 +703,25 @@ void BBState::InitFromSucc(const BBState &Other) {
 /// The top-down traversal uses this to merge information about predecessors to
 /// form the initial state for a new block.
 void BBState::MergePred(const BBState &Other) {
+  if (TopDownPathCount == OverflowOccurredValue)
+    return;
+
   // Other.TopDownPathCount can be 0, in which case it is either dead or a
   // loop backedge. Loop backedges are special.
   TopDownPathCount += Other.TopDownPathCount;
 
+  // In order to be consistent, we clear the top down pointers when, by adding
+  // Other.TopDownPathCount, TopDownPathCount becomes OverflowOccurredValue
+  // even though "true" overflow has not occurred.
+  if (TopDownPathCount == OverflowOccurredValue) {
+    clearTopDownPointers();
+    return;
+  }
+
   // Check for overflow. If we have overflow, fall back to conservative
   // behavior.
   if (TopDownPathCount < Other.TopDownPathCount) {
+    TopDownPathCount = OverflowOccurredValue;
     clearTopDownPointers();
     return;
   }
@@ -676,13 +747,25 @@ void BBState::MergePred(const BBState &Other) {
 /// The bottom-up traversal uses this to merge information about successors to
 /// form the initial state for a new block.
 void BBState::MergeSucc(const BBState &Other) {
+  if (BottomUpPathCount == OverflowOccurredValue)
+    return;
+
   // Other.BottomUpPathCount can be 0, in which case it is either dead or a
   // loop backedge. Loop backedges are special.
   BottomUpPathCount += Other.BottomUpPathCount;
 
+  // In order to be consistent, we clear the bottom up pointers when, by adding
+  // Other.BottomUpPathCount, BottomUpPathCount becomes OverflowOccurredValue
+  // even though "true" overflow has not occurred.
+  if (BottomUpPathCount == OverflowOccurredValue) {
+    clearBottomUpPointers();
+    return;
+  }
+
   // Check for overflow. If we have overflow, fall back to conservative
   // behavior.
   if (BottomUpPathCount < Other.BottomUpPathCount) {
+    BottomUpPathCount = OverflowOccurredValue;
     clearBottomUpPointers();
     return;
   }
@@ -991,25 +1074,14 @@ namespace {
   class ObjCARCOpt : public FunctionPass {
     bool Changed;
     ProvenanceAnalysis PA;
+    ARCRuntimeEntryPoints EP;
+
+    // This is used to track if a pointer is stored into an alloca.
+    DenseSet<const Value *> MultiOwnersSet;
 
     /// A flag indicating whether this optimization pass should run.
     bool Run;
 
-    /// Declarations for ObjC runtime functions, for use in creating calls to
-    /// them. These are initialized lazily to avoid cluttering up the Module
-    /// with unused declarations.
-
-    /// Declaration for ObjC runtime function objc_autoreleaseReturnValue.
-    Constant *AutoreleaseRVCallee;
-    /// Declaration for ObjC runtime function objc_release.
-    Constant *ReleaseCallee;
-    /// Declaration for ObjC runtime function objc_retain.
-    Constant *RetainCallee;
-    /// Declaration for ObjC runtime function objc_retainBlock.
-    Constant *RetainBlockCallee;
-    /// Declaration for ObjC runtime function objc_autorelease.
-    Constant *AutoreleaseCallee;
-
     /// Flags which determine whether each of the interesting runtime functions
     /// is in fact used in the current function.
unsigned UsedInThisFunction; @@ -1032,19 +1104,9 @@ namespace { unsigned ARCAnnotationProvenanceSourceMDKind; #endif // ARC_ANNOATIONS - Constant *getAutoreleaseRVCallee(Module *M); - Constant *getReleaseCallee(Module *M); - Constant *getRetainCallee(Module *M); - Constant *getRetainBlockCallee(Module *M); - Constant *getAutoreleaseCallee(Module *M); - - bool IsRetainBlockOptimizable(const Instruction *Inst); - bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV); void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV, InstructionClass &Class); - bool OptimizeRetainBlockCall(Function &F, Instruction *RetainBlock, - InstructionClass &Class); void OptimizeIndividualCalls(Function &F); void CheckForCFGHazards(const BasicBlock *BB, @@ -1078,9 +1140,9 @@ namespace { MapVector<Value *, RRInfo> &Retains, DenseMap<Value *, RRInfo> &Releases, Module *M, - SmallVector<Instruction *, 4> &NewRetains, - SmallVector<Instruction *, 4> &NewReleases, - SmallVector<Instruction *, 8> &DeadInsts, + SmallVectorImpl<Instruction *> &NewRetains, + SmallVectorImpl<Instruction *> &NewReleases, + SmallVectorImpl<Instruction *> &DeadInsts, RRInfo &RetainsToMove, RRInfo &ReleasesToMove, Value *Arg, @@ -1133,101 +1195,6 @@ void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); } -bool ObjCARCOpt::IsRetainBlockOptimizable(const Instruction *Inst) { - // Without the magic metadata tag, we have to assume this might be an - // objc_retainBlock call inserted to convert a block pointer to an id, - // in which case it really is needed. - if (!Inst->getMetadata(CopyOnEscapeMDKind)) - return false; - - // If the pointer "escapes" (not including being used in a call), - // the copy may be needed. - if (DoesRetainableObjPtrEscape(Inst)) - return false; - - // Otherwise, it's not needed. 
- return true; -} - -Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) { - if (!AutoreleaseRVCallee) { - LLVMContext &C = M->getContext(); - Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - Type *Params[] = { I8X }; - FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttributeSet Attribute = - AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); - AutoreleaseRVCallee = - M->getOrInsertFunction("objc_autoreleaseReturnValue", FTy, - Attribute); - } - return AutoreleaseRVCallee; -} - -Constant *ObjCARCOpt::getReleaseCallee(Module *M) { - if (!ReleaseCallee) { - LLVMContext &C = M->getContext(); - Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; - AttributeSet Attribute = - AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); - ReleaseCallee = - M->getOrInsertFunction( - "objc_release", - FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false), - Attribute); - } - return ReleaseCallee; -} - -Constant *ObjCARCOpt::getRetainCallee(Module *M) { - if (!RetainCallee) { - LLVMContext &C = M->getContext(); - Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; - AttributeSet Attribute = - AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); - RetainCallee = - M->getOrInsertFunction( - "objc_retain", - FunctionType::get(Params[0], Params, /*isVarArg=*/false), - Attribute); - } - return RetainCallee; -} - -Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) { - if (!RetainBlockCallee) { - LLVMContext &C = M->getContext(); - Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; - // objc_retainBlock is not nounwind because it calls user copy constructors - // which could theoretically throw. - RetainBlockCallee = - M->getOrInsertFunction( - "objc_retainBlock", - FunctionType::get(Params[0], Params, /*isVarArg=*/false), - AttributeSet()); - } - return RetainBlockCallee; -} - -Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) { - if (!AutoreleaseCallee) { - LLVMContext &C = M->getContext(); - Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; - AttributeSet Attribute = - AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); - AutoreleaseCallee = - M->getOrInsertFunction( - "objc_autorelease", - FunctionType::get(Params[0], Params, /*isVarArg=*/false), - Attribute); - } - return AutoreleaseCallee; -} - /// Turn objc_retainAutoreleasedReturnValue into objc_retain if the operand is /// not a return value. Or, if it can be paired with an /// objc_autoreleaseReturnValue, delete the pair and return true. 
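Each of the lazy getters deleted here repeated the same memoization around
Module::getOrInsertFunction; the hunks below switch every call site to the
shared ARCRuntimeEntryPoints cache introduced earlier in this change. A hedged
sketch of the resulting call-site idiom (the wrapper name demoteRetainRV is
illustrative only, not a function in the patch):

    #include "ARCRuntimeEntryPoints.h"

    using namespace llvm;
    using namespace llvm::objcarc;

    // Retarget a retainRV-style call at plain objc_retain, pulling the cached
    // declaration from the shared entry-point table instead of a per-pass
    // getter.
    static void demoteRetainRV(CallInst *RetainRV, ARCRuntimeEntryPoints &EP) {
      Constant *NewDecl = EP.get(ARCRuntimeEntryPoints::EPT_Retain);
      RetainRV->setCalledFunction(NewDecl);
    }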
@@ -1281,7 +1248,8 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { "objc_retain since the operand is not a return value.\n" "Old = " << *RetainRV << "\n"); - cast<CallInst>(RetainRV)->setCalledFunction(getRetainCallee(F.getParent())); + Constant *NewDecl = EP.get(ARCRuntimeEntryPoints::EPT_Retain); + cast<CallInst>(RetainRV)->setCalledFunction(NewDecl); DEBUG(dbgs() << "New = " << *RetainRV << "\n"); @@ -1318,8 +1286,8 @@ ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV, "Old = " << *AutoreleaseRV << "\n"); CallInst *AutoreleaseRVCI = cast<CallInst>(AutoreleaseRV); - AutoreleaseRVCI-> - setCalledFunction(getAutoreleaseCallee(F.getParent())); + Constant *NewDecl = EP.get(ARCRuntimeEntryPoints::EPT_Autorelease); + AutoreleaseRVCI->setCalledFunction(NewDecl); AutoreleaseRVCI->setTailCall(false); // Never tail call objc_autorelease. Class = IC_Autorelease; @@ -1327,40 +1295,6 @@ ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV, } -// \brief Attempt to strength reduce objc_retainBlock calls to objc_retain -// calls. -// -// Specifically: If an objc_retainBlock call has the copy_on_escape metadata and -// does not escape (following the rules of block escaping), strength reduce the -// objc_retainBlock to an objc_retain. -// -// TODO: If an objc_retainBlock call is dominated period by a previous -// objc_retainBlock call, strength reduce the objc_retainBlock to an -// objc_retain. -bool -ObjCARCOpt::OptimizeRetainBlockCall(Function &F, Instruction *Inst, - InstructionClass &Class) { - assert(GetBasicInstructionClass(Inst) == Class); - assert(IC_RetainBlock == Class); - - // If we can not optimize Inst, return false. - if (!IsRetainBlockOptimizable(Inst)) - return false; - - Changed = true; - ++NumPeeps; - - DEBUG(dbgs() << "Strength reduced retainBlock => retain.\n"); - DEBUG(dbgs() << "Old: " << *Inst << "\n"); - CallInst *RetainBlock = cast<CallInst>(Inst); - RetainBlock->setCalledFunction(getRetainCallee(F.getParent())); - // Remove copy_on_escape metadata. - RetainBlock->setMetadata(CopyOnEscapeMDKind, 0); - Class = IC_Retain; - DEBUG(dbgs() << "New: " << *Inst << "\n"); - return true; -} - /// Visit each call, one at a time, and make simplifications without doing any /// additional analysis. void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { @@ -1437,15 +1371,6 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { } break; } - case IC_RetainBlock: - // If we strength reduce an objc_retainBlock to an objc_retain, continue - // onto the objc_retain peephole optimizations. Otherwise break. - if (!OptimizeRetainBlockCall(F, Inst, Class)) - break; - // FALLTHROUGH - case IC_Retain: - ++NumRetainsBeforeOpt; - break; case IC_RetainRV: if (OptimizeRetainRVCall(F, Inst)) continue; @@ -1453,9 +1378,6 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { case IC_AutoreleaseRV: OptimizeAutoreleaseRVCall(F, Inst, Class); break; - case IC_Release: - ++NumReleasesBeforeOpt; - break; } // objc_autorelease(x) -> objc_release(x) if x is otherwise unused. @@ -1469,9 +1391,10 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { // Create the declaration lazily. 
LLVMContext &C = Inst->getContext(); - CallInst *NewCall = - CallInst::Create(getReleaseCallee(F.getParent()), - Call->getArgOperand(0), "", Call); + + Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Release); + CallInst *NewCall = CallInst::Create(Decl, Call->getArgOperand(0), "", + Call); NewCall->setMetadata(ImpreciseReleaseMDKind, MDNode::get(C, None)); DEBUG(dbgs() << "Replacing autorelease{,RV}(x) with objc_release(x) " @@ -1639,13 +1562,15 @@ static void CheckForUseCFGHazard(const Sequence SuccSSeq, PtrState &S, bool &SomeSuccHasSame, bool &AllSuccsHaveSame, + bool &NotAllSeqEqualButKnownSafe, bool &ShouldContinue) { switch (SuccSSeq) { case S_CanRelease: { - if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) { + if (!S.IsKnownSafe() && !SuccSRRIKnownSafe) { S.ClearSequenceProgress(); break; } + S.SetCFGHazardAfflicted(true); ShouldContinue = true; break; } @@ -1655,8 +1580,10 @@ static void CheckForUseCFGHazard(const Sequence SuccSSeq, case S_Stop: case S_Release: case S_MovableRelease: - if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) + if (!S.IsKnownSafe() && !SuccSRRIKnownSafe) AllSuccsHaveSame = false; + else + NotAllSeqEqualButKnownSafe = true; break; case S_Retain: llvm_unreachable("bottom-up pointer in retain state!"); @@ -1672,7 +1599,8 @@ static void CheckForCanReleaseCFGHazard(const Sequence SuccSSeq, const bool SuccSRRIKnownSafe, PtrState &S, bool &SomeSuccHasSame, - bool &AllSuccsHaveSame) { + bool &AllSuccsHaveSame, + bool &NotAllSeqEqualButKnownSafe) { switch (SuccSSeq) { case S_CanRelease: SomeSuccHasSame = true; @@ -1681,8 +1609,10 @@ static void CheckForCanReleaseCFGHazard(const Sequence SuccSSeq, case S_Release: case S_MovableRelease: case S_Use: - if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) + if (!S.IsKnownSafe() && !SuccSRRIKnownSafe) AllSuccsHaveSame = false; + else + NotAllSeqEqualButKnownSafe = true; break; case S_Retain: llvm_unreachable("bottom-up pointer in retain state!"); @@ -1718,6 +1648,7 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); bool SomeSuccHasSame = false; bool AllSuccsHaveSame = true; + bool NotAllSeqEqualButKnownSafe = false; succ_const_iterator SI(TI), SE(TI, false); @@ -1742,24 +1673,24 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, // If we have S_Use or S_CanRelease, perform our check for cfg hazard // checks. - const bool SuccSRRIKnownSafe = SuccS.RRI.KnownSafe; + const bool SuccSRRIKnownSafe = SuccS.IsKnownSafe(); // *NOTE* We do not use Seq from above here since we are allowing for // S.GetSeq() to change while we are visiting basic blocks. switch(S.GetSeq()) { case S_Use: { bool ShouldContinue = false; - CheckForUseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, S, - SomeSuccHasSame, AllSuccsHaveSame, + CheckForUseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, S, SomeSuccHasSame, + AllSuccsHaveSame, NotAllSeqEqualButKnownSafe, ShouldContinue); if (ShouldContinue) continue; break; } case S_CanRelease: { - CheckForCanReleaseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, - S, SomeSuccHasSame, - AllSuccsHaveSame); + CheckForCanReleaseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, S, + SomeSuccHasSame, AllSuccsHaveSame, + NotAllSeqEqualButKnownSafe); break; } case S_Retain: @@ -1774,8 +1705,15 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, // If the state at the other end of any of the successor edges // matches the current state, require all edges to match. This // guards against loops in the middle of a sequence. 
-    if (SomeSuccHasSame && !AllSuccsHaveSame)
+    if (SomeSuccHasSame && !AllSuccsHaveSame) {
       S.ClearSequenceProgress();
+    } else if (NotAllSeqEqualButKnownSafe) {
+      // If we would have cleared the state were it not for the fact that we
+      // are known safe, stop code motion. This is because whether or not it
+      // is safe to remove RR pairs via KnownSafe is an orthogonal concept to
+      // whether we are allowed to perform code motion.
+      S.SetCFGHazardAfflicted(true);
+    }
   }
 }
 
@@ -1812,10 +1750,10 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
     Sequence NewSeq = ReleaseMetadata ? S_MovableRelease : S_Release;
     ANNOTATE_BOTTOMUP(Inst, Arg, S.GetSeq(), NewSeq);
     S.ResetSequenceProgress(NewSeq);
-    S.RRI.ReleaseMetadata = ReleaseMetadata;
-    S.RRI.KnownSafe = S.HasKnownPositiveRefCount();
-    S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
-    S.RRI.Calls.insert(Inst);
+    S.SetReleaseMetadata(ReleaseMetadata);
+    S.SetKnownSafe(S.HasKnownPositiveRefCount());
+    S.SetTailCallRelease(cast<CallInst>(Inst)->isTailCall());
+    S.InsertCall(Inst);
     S.SetKnownPositiveRefCount();
     break;
   }
@@ -1839,14 +1777,14 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
     case S_Use:
       // If OldSeq is not S_Use or OldSeq is S_Use and we are tracking an
       // imprecise release, clear our reverse insertion points.
-      if (OldSeq != S_Use || S.RRI.IsTrackingImpreciseReleases())
-        S.RRI.ReverseInsertPts.clear();
+      if (OldSeq != S_Use || S.IsTrackingImpreciseReleases())
+        S.ClearReverseInsertPts();
       // FALL THROUGH
     case S_CanRelease:
       // Don't do retain+release tracking for IC_RetainRV, because it's
       // better to let it remain as the first instruction after a call.
       if (Class != IC_RetainRV)
-        Retains[Inst] = S.RRI;
+        Retains[Inst] = S.GetRRInfo();
       S.ClearSequenceProgress();
       break;
     case S_None:
@@ -1866,6 +1804,28 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
   case IC_None:
     // These are irrelevant.
     return NestingDetected;
+  case IC_User:
+    // If we have a store into an alloca of a pointer we are tracking, the
+    // pointer has multiple owners implying that we must be more conservative.
+    //
+    // This comes up in the context of a pointer being ``KnownSafe''. In the
+    // presence of a block being initialized, the frontend will emit the
+    // objc_retain on the original pointer and the release on the pointer
+    // loaded from the alloca. The optimizer will, through the provenance
+    // analysis, realize that the two are related, but since we only require
+    // KnownSafe in one direction, will match the inner retain on the original
+    // pointer with the guard release on the original pointer. This is fixed
+    // by ensuring that in the presence of allocas we only unconditionally
+    // remove pointers if both our retain and our release are KnownSafe.
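+    //
+    // A sketch of the problematic shape (illustrative IR, not taken from the
+    // patch):
+    //
+    //   %x = call i8* @objc_retain(i8* %p)
+    //   store i8* %x, i8** %alloca
+    //   ...
+    //   %y = load i8** %alloca
+    //   call void @objc_release(i8* %y)
+    //
+    // Here %x and %y share provenance, so the pair would otherwise look
+    // removable even though the alloca gives the object a second owner.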
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + if (AreAnyUnderlyingObjectsAnAlloca(SI->getPointerOperand())) { + BBState::ptr_iterator I = MyStates.findPtrBottomUpState( + StripPointerCastsAndObjCCalls(SI->getValueOperand())); + if (I != MyStates.bottom_up_ptr_end()) + MultiOwnersSet.insert(I->first); + } + } + break; default: break; } @@ -1908,14 +1868,14 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, if (CanUse(Inst, Ptr, PA, Class)) { DEBUG(dbgs() << "CanUse: Seq: " << Seq << "; " << *Ptr << "\n"); - assert(S.RRI.ReverseInsertPts.empty()); + assert(!S.HasReverseInsertPts()); // If this is an invoke instruction, we're scanning it as part of // one of its successor blocks, since we can't insert code after it // in its own block, and we don't want to split critical edges. if (isa<InvokeInst>(Inst)) - S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt()); + S.InsertReverseInsertPt(BB->getFirstInsertionPt()); else - S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst))); + S.InsertReverseInsertPt(llvm::next(BasicBlock::iterator(Inst))); S.SetSeq(S_Use); ANNOTATE_BOTTOMUP(Inst, Ptr, Seq, S_Use); } else if (Seq == S_Release && IsUser(Class)) { @@ -1924,12 +1884,12 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, // Non-movable releases depend on any possible objc pointer use. S.SetSeq(S_Stop); ANNOTATE_BOTTOMUP(Inst, Ptr, S_Release, S_Stop); - assert(S.RRI.ReverseInsertPts.empty()); + assert(!S.HasReverseInsertPts()); // As above; handle invoke specially. if (isa<InvokeInst>(Inst)) - S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt()); + S.InsertReverseInsertPt(BB->getFirstInsertionPt()); else - S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst))); + S.InsertReverseInsertPt(llvm::next(BasicBlock::iterator(Inst))); } break; case S_Stop: @@ -2049,8 +2009,8 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, ANNOTATE_TOPDOWN(Inst, Arg, S.GetSeq(), S_Retain); S.ResetSequenceProgress(S_Retain); - S.RRI.KnownSafe = S.HasKnownPositiveRefCount(); - S.RRI.Calls.insert(Inst); + S.SetKnownSafe(S.HasKnownPositiveRefCount()); + S.InsertCall(Inst); } S.SetKnownPositiveRefCount(); @@ -2073,12 +2033,12 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, case S_Retain: case S_CanRelease: if (OldSeq == S_Retain || ReleaseMetadata != 0) - S.RRI.ReverseInsertPts.clear(); + S.ClearReverseInsertPts(); // FALL THROUGH case S_Use: - S.RRI.ReleaseMetadata = ReleaseMetadata; - S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); - Releases[Inst] = S.RRI; + S.SetReleaseMetadata(ReleaseMetadata); + S.SetTailCallRelease(cast<CallInst>(Inst)->isTailCall()); + Releases[Inst] = S.GetRRInfo(); ANNOTATE_TOPDOWN(Inst, Arg, S.GetSeq(), S_None); S.ClearSequenceProgress(); break; @@ -2122,8 +2082,8 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, case S_Retain: S.SetSeq(S_CanRelease); ANNOTATE_TOPDOWN(Inst, Ptr, Seq, S_CanRelease); - assert(S.RRI.ReverseInsertPts.empty()); - S.RRI.ReverseInsertPts.insert(Inst); + assert(!S.HasReverseInsertPts()); + S.InsertReverseInsertPt(Inst); // One call can't cause a transition from S_Retain to S_CanRelease // and S_CanRelease to S_Use. If we've made the first transition, @@ -2350,8 +2310,8 @@ void ObjCARCOpt::MoveCalls(Value *Arg, Instruction *InsertPt = *PI; Value *MyArg = ArgTy == ParamTy ? 
Arg : new BitCastInst(Arg, ParamTy, "", InsertPt); - CallInst *Call = - CallInst::Create(getRetainCallee(M), MyArg, "", InsertPt); + Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Retain); + CallInst *Call = CallInst::Create(Decl, MyArg, "", InsertPt); Call->setDoesNotThrow(); Call->setTailCall(); @@ -2364,8 +2324,8 @@ void ObjCARCOpt::MoveCalls(Value *Arg, Instruction *InsertPt = *PI; Value *MyArg = ArgTy == ParamTy ? Arg : new BitCastInst(Arg, ParamTy, "", InsertPt); - CallInst *Call = CallInst::Create(getReleaseCallee(M), MyArg, - "", InsertPt); + Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Release); + CallInst *Call = CallInst::Create(Decl, MyArg, "", InsertPt); // Attach a clang.imprecise_release metadata tag, if appropriate. if (MDNode *M = ReleasesToMove.ReleaseMetadata) Call->setMetadata(ImpreciseReleaseMDKind, M); @@ -2403,17 +2363,20 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> MapVector<Value *, RRInfo> &Retains, DenseMap<Value *, RRInfo> &Releases, Module *M, - SmallVector<Instruction *, 4> &NewRetains, - SmallVector<Instruction *, 4> &NewReleases, - SmallVector<Instruction *, 8> &DeadInsts, + SmallVectorImpl<Instruction *> &NewRetains, + SmallVectorImpl<Instruction *> &NewReleases, + SmallVectorImpl<Instruction *> &DeadInsts, RRInfo &RetainsToMove, RRInfo &ReleasesToMove, Value *Arg, bool KnownSafe, bool &AnyPairsCompletelyEliminated) { // If a pair happens in a region where it is known that the reference count - // is already incremented, we can similarly ignore possible decrements. + // is already incremented, we can similarly ignore possible decrements unless + // we are dealing with a retainable object with multiple provenance sources. bool KnownSafeTD = true, KnownSafeBU = true; + bool MultipleOwners = false; + bool CFGHazardAfflicted = false; // Connect the dots between the top-down-collected RetainsToMove and // bottom-up-collected ReleasesToMove to form sets of related calls. @@ -2432,6 +2395,8 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> assert(It != Retains.end()); const RRInfo &NewRetainRRI = It->second; KnownSafeTD &= NewRetainRRI.KnownSafe; + MultipleOwners = + MultipleOwners || MultiOwnersSet.count(GetObjCArg(NewRetain)); for (SmallPtrSet<Instruction *, 2>::const_iterator LI = NewRetainRRI.Calls.begin(), LE = NewRetainRRI.Calls.end(); LI != LE; ++LI) { @@ -2441,10 +2406,27 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> if (Jt == Releases.end()) return false; const RRInfo &NewRetainReleaseRRI = Jt->second; - assert(NewRetainReleaseRRI.Calls.count(NewRetain)); + + // If the release does not have a reference to the retain as well, + // something happened which is unaccounted for. Do not do anything. + // + // This can happen if we catch an additive overflow during path count + // merging. + if (!NewRetainReleaseRRI.Calls.count(NewRetain)) + return false; + if (ReleasesToMove.Calls.insert(NewRetainRelease)) { - OldDelta -= - BBStates[NewRetainRelease->getParent()].GetAllPathCount(); + + // If we overflow when we compute the path count, don't remove/move + // anything. + const BBState &NRRBBState = BBStates[NewRetainRelease->getParent()]; + unsigned PathCount = BBState::OverflowOccurredValue; + if (NRRBBState.GetAllPathCountWithOverflow(PathCount)) + return false; + assert(PathCount != BBState::OverflowOccurredValue && + "PathCount at this point can not be " + "OverflowOccurredValue."); + OldDelta -= PathCount; // Merge the ReleaseMetadata and IsTailCallRelease values. 
if (FirstRelease) { @@ -2469,8 +2451,18 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> RE = NewRetainReleaseRRI.ReverseInsertPts.end(); RI != RE; ++RI) { Instruction *RIP = *RI; - if (ReleasesToMove.ReverseInsertPts.insert(RIP)) - NewDelta -= BBStates[RIP->getParent()].GetAllPathCount(); + if (ReleasesToMove.ReverseInsertPts.insert(RIP)) { + // If we overflow when we compute the path count, don't + // remove/move anything. + const BBState &RIPBBState = BBStates[RIP->getParent()]; + PathCount = BBState::OverflowOccurredValue; + if (RIPBBState.GetAllPathCountWithOverflow(PathCount)) + return false; + assert(PathCount != BBState::OverflowOccurredValue && + "PathCount at this point can not be " + "OverflowOccurredValue."); + NewDelta -= PathCount; + } } NewReleases.push_back(NewRetainRelease); } @@ -2488,6 +2480,7 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> assert(It != Releases.end()); const RRInfo &NewReleaseRRI = It->second; KnownSafeBU &= NewReleaseRRI.KnownSafe; + CFGHazardAfflicted |= NewReleaseRRI.CFGHazardAfflicted; for (SmallPtrSet<Instruction *, 2>::const_iterator LI = NewReleaseRRI.Calls.begin(), LE = NewReleaseRRI.Calls.end(); LI != LE; ++LI) { @@ -2497,10 +2490,25 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> if (Jt == Retains.end()) return false; const RRInfo &NewReleaseRetainRRI = Jt->second; - assert(NewReleaseRetainRRI.Calls.count(NewRelease)); + + // If the retain does not have a reference to the release as well, + // something happened which is unaccounted for. Do not do anything. + // + // This can happen if we catch an additive overflow during path count + // merging. + if (!NewReleaseRetainRRI.Calls.count(NewRelease)) + return false; + if (RetainsToMove.Calls.insert(NewReleaseRetain)) { - unsigned PathCount = - BBStates[NewReleaseRetain->getParent()].GetAllPathCount(); + // If we overflow when we compute the path count, don't remove/move + // anything. + const BBState &NRRBBState = BBStates[NewReleaseRetain->getParent()]; + unsigned PathCount = BBState::OverflowOccurredValue; + if (NRRBBState.GetAllPathCountWithOverflow(PathCount)) + return false; + assert(PathCount != BBState::OverflowOccurredValue && + "PathCount at this point can not be " + "OverflowOccurredValue."); OldDelta += PathCount; OldCount += PathCount; @@ -2512,7 +2520,16 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> RI != RE; ++RI) { Instruction *RIP = *RI; if (RetainsToMove.ReverseInsertPts.insert(RIP)) { - PathCount = BBStates[RIP->getParent()].GetAllPathCount(); + // If we overflow when we compute the path count, don't + // remove/move anything. + const BBState &RIPBBState = BBStates[RIP->getParent()]; + + PathCount = BBState::OverflowOccurredValue; + if (RIPBBState.GetAllPathCountWithOverflow(PathCount)) + return false; + assert(PathCount != BBState::OverflowOccurredValue && + "PathCount at this point can not be " + "OverflowOccurredValue."); NewDelta += PathCount; NewCount += PathCount; } @@ -2525,9 +2542,12 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> if (NewRetains.empty()) break; } - // If the pointer is known incremented or nested, we can safely delete the - // pair regardless of what's between them. - if (KnownSafeTD || KnownSafeBU) { + // If the pointer is known incremented in 1 direction and we do not have + // MultipleOwners, we can safely remove the retain/releases. Otherwise we need + // to be known safe in both directions. 
+ bool UnconditionallySafe = (KnownSafeTD && KnownSafeBU) || + ((KnownSafeTD || KnownSafeBU) && !MultipleOwners); + if (UnconditionallySafe) { RetainsToMove.ReverseInsertPts.clear(); ReleasesToMove.ReverseInsertPts.clear(); NewCount = 0; @@ -2538,6 +2558,14 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> // less aggressive solution which is. if (NewDelta != 0) return false; + + // At this point, we are not going to remove any RR pairs, but we still are + // able to move RR pairs. If one of our pointers is afflicted with + // CFGHazards, we cannot perform such code motion so exit early. + const bool WillPerformCodeMotion = RetainsToMove.ReverseInsertPts.size() || + ReleasesToMove.ReverseInsertPts.size(); + if (CFGHazardAfflicted && WillPerformCodeMotion) + return false; } // Determine whether the original call points are balanced in the retain and @@ -2685,9 +2713,8 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { Changed = true; // If the load has a builtin retain, insert a plain retain for it. if (Class == IC_LoadWeakRetained) { - CallInst *CI = - CallInst::Create(getRetainCallee(F.getParent()), EarlierCall, - "", Call); + Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Retain); + CallInst *CI = CallInst::Create(Decl, EarlierCall, "", Call); CI->setTailCall(); } // Zap the fully redundant load. @@ -2715,9 +2742,8 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { Changed = true; // If the load has a builtin retain, insert a plain retain for it. if (Class == IC_LoadWeakRetained) { - CallInst *CI = - CallInst::Create(getRetainCallee(F.getParent()), EarlierCall, - "", Call); + Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Retain); + CallInst *CI = CallInst::Create(Decl, EarlierCall, "", Call); CI->setTailCall(); } // Zap the fully redundant load. @@ -2801,23 +2827,29 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { /// Identify program paths which execute sequences of retains and releases which /// can be eliminated. bool ObjCARCOpt::OptimizeSequences(Function &F) { - /// Releases, Retains - These are used to store the results of the main flow - /// analysis. These use Value* as the key instead of Instruction* so that the - /// map stays valid when we get around to rewriting code and calls get - /// replaced by arguments. + // Releases, Retains - These are used to store the results of the main flow + // analysis. These use Value* as the key instead of Instruction* so that the + // map stays valid when we get around to rewriting code and calls get + // replaced by arguments. DenseMap<Value *, RRInfo> Releases; MapVector<Value *, RRInfo> Retains; - /// This is used during the traversal of the function to track the - /// states for each identified object at each block. + // This is used during the traversal of the function to track the + // states for each identified object at each block. DenseMap<const BasicBlock *, BBState> BBStates; // Analyze the CFG of the function, and all instructions. bool NestingDetected = Visit(F, BBStates, Retains, Releases); // Transform. - return PerformCodePlacement(BBStates, Retains, Releases, F.getParent()) && - NestingDetected; + bool AnyPairsCompletelyEliminated = PerformCodePlacement(BBStates, Retains, + Releases, + F.getParent()); + + // Cleanup. 
+ MultiOwnersSet.clear(); + + return AnyPairsCompletelyEliminated && NestingDetected; } /// Check if there is a dependent call earlier that does not have anything in @@ -3025,12 +3057,8 @@ bool ObjCARCOpt::doInitialization(Module &M) { // they are not, because they return their argument value. And objc_release // calls finalizers which can have arbitrary side effects. - // These are initialized lazily. - AutoreleaseRVCallee = 0; - ReleaseCallee = 0; - RetainCallee = 0; - RetainBlockCallee = 0; - AutoreleaseCallee = 0; + // Initialize our runtime entry point cache. + EP.Initialize(&M); return false; } @@ -3050,6 +3078,12 @@ bool ObjCARCOpt::runOnFunction(Function &F) { PA.setAA(&getAnalysis<AliasAnalysis>()); +#ifndef NDEBUG + if (AreStatisticsEnabled()) { + GatherStatistics(F, false); + } +#endif + // This pass performs several distinct transformations. As a compile-time aid // when compiling code that isn't ObjC, skip these if the relevant ObjC // library functions aren't declared. diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCUtil.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCUtil.cpp index 03e12d4..53c077e 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCUtil.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCUtil.cpp @@ -1,4 +1,4 @@ -//===- ObjCARCUtil.cpp - ObjC ARC Optimization --------*- mode: c++ -*-----===// +//===- ObjCARCUtil.cpp - ObjC ARC Optimization ----------------------------===// // // The LLVM Compiler Infrastructure // @@ -112,6 +112,8 @@ InstructionClass llvm::objcarc::GetFunctionClass(const Function *F) { .Case("objc_retain_autorelease", IC_FusedRetainAutorelease) .Case("objc_retainAutorelease", IC_FusedRetainAutorelease) .Case("objc_retainAutoreleaseReturnValue",IC_FusedRetainAutoreleaseRV) + .Case("objc_sync_enter", IC_User) + .Case("objc_sync_exit", IC_User) .Default(IC_CallOrUser); // Argument is i8** diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h index ec449fd8e..a13fb9e 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h +++ b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h @@ -1,4 +1,4 @@ -//===- ProvenanceAnalysis.h - ObjC ARC Optimization ---*- mode: c++ -*-----===// +//===- ProvenanceAnalysis.h - ObjC ARC Optimization ---*- C++ -*-----------===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp index a097308..a3eb07a9 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -83,7 +83,7 @@ bool ADCE::runOnFunction(Function& F) { I->dropAllReferences(); } - for (SmallVector<Instruction*, 1024>::iterator I = worklist.begin(), + for (SmallVectorImpl<Instruction *>::iterator I = worklist.begin(), E = worklist.end(); I != E; ++I) { ++NumRemoved; (*I)->eraseFromParent(); diff --git a/contrib/llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp b/contrib/llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp deleted file mode 100644 index e755008..0000000 --- a/contrib/llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp +++ /dev/null @@ -1,152 +0,0 @@ -//===-- BasicBlockPlacement.cpp - Basic Block Code Layout optimization ----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// This file implements a very simple profile guided basic block placement -// algorithm. The idea is to put frequently executed blocks together at the -// start of the function, and hopefully increase the number of fall-through -// conditional branches. If there is no profile information for a particular -// function, this pass basically orders blocks in depth-first order -// -// The algorithm implemented here is basically "Algo1" from "Profile Guided Code -// Positioning" by Pettis and Hansen, except that it uses basic block counts -// instead of edge counts. This should be improved in many ways, but is very -// simple for now. -// -// Basically we "place" the entry block, then loop over all successors in a DFO, -// placing the most frequently executed successor until we run out of blocks. I -// told you this was _extremely_ simplistic. :) This is also much slower than it -// could be. When it becomes important, this pass will be rewritten to use a -// better algorithm, and then we can worry about efficiency. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "block-placement" -#include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/ProfileInfo.h" -#include "llvm/IR/Function.h" -#include "llvm/Pass.h" -#include "llvm/Support/CFG.h" -#include <set> -using namespace llvm; - -STATISTIC(NumMoved, "Number of basic blocks moved"); - -namespace { - struct BlockPlacement : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - BlockPlacement() : FunctionPass(ID) { - initializeBlockPlacementPass(*PassRegistry::getPassRegistry()); - } - - virtual bool runOnFunction(Function &F); - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); - AU.addRequired<ProfileInfo>(); - //AU.addPreserved<ProfileInfo>(); // Does this work? - } - private: - /// PI - The profile information that is guiding us. - /// - ProfileInfo *PI; - - /// NumMovedBlocks - Every time we move a block, increment this counter. - /// - unsigned NumMovedBlocks; - - /// PlacedBlocks - Every time we place a block, remember it so we don't get - /// into infinite loops. - std::set<BasicBlock*> PlacedBlocks; - - /// InsertPos - This an iterator to the next place we want to insert a - /// block. - Function::iterator InsertPos; - - /// PlaceBlocks - Recursively place the specified blocks and any unplaced - /// successors. - void PlaceBlocks(BasicBlock *BB); - }; -} - -char BlockPlacement::ID = 0; -INITIALIZE_PASS_BEGIN(BlockPlacement, "block-placement", - "Profile Guided Basic Block Placement", false, false) -INITIALIZE_AG_DEPENDENCY(ProfileInfo) -INITIALIZE_PASS_END(BlockPlacement, "block-placement", - "Profile Guided Basic Block Placement", false, false) - -FunctionPass *llvm::createBlockPlacementPass() { return new BlockPlacement(); } - -bool BlockPlacement::runOnFunction(Function &F) { - PI = &getAnalysis<ProfileInfo>(); - - NumMovedBlocks = 0; - InsertPos = F.begin(); - - // Recursively place all blocks. - PlaceBlocks(F.begin()); - - PlacedBlocks.clear(); - NumMoved += NumMovedBlocks; - return NumMovedBlocks != 0; -} - - -/// PlaceBlocks - Recursively place the specified blocks and any unplaced -/// successors. -void BlockPlacement::PlaceBlocks(BasicBlock *BB) { - assert(!PlacedBlocks.count(BB) && "Already placed this block!"); - PlacedBlocks.insert(BB); - - // Place the specified block. 
- if (&*InsertPos != BB) { - // Use splice to move the block into the right place. This avoids having to - // remove the block from the function then readd it, which causes a bunch of - // symbol table traffic that is entirely pointless. - Function::BasicBlockListType &Blocks = BB->getParent()->getBasicBlockList(); - Blocks.splice(InsertPos, Blocks, BB); - - ++NumMovedBlocks; - } else { - // This block is already in the right place, we don't have to do anything. - ++InsertPos; - } - - // Keep placing successors until we run out of ones to place. Note that this - // loop is very inefficient (N^2) for blocks with many successors, like switch - // statements. FIXME! - while (1) { - // Okay, now place any unplaced successors. - succ_iterator SI = succ_begin(BB), E = succ_end(BB); - - // Scan for the first unplaced successor. - for (; SI != E && PlacedBlocks.count(*SI); ++SI) - /*empty*/; - if (SI == E) return; // No more successors to place. - - double MaxExecutionCount = PI->getExecutionCount(*SI); - BasicBlock *MaxSuccessor = *SI; - - // Scan for more frequently executed successors - for (; SI != E; ++SI) - if (!PlacedBlocks.count(*SI)) { - double Count = PI->getExecutionCount(*SI); - if (Count > MaxExecutionCount || - // Prefer to not disturb the code. - (Count == MaxExecutionCount && *SI == &*InsertPos)) { - MaxExecutionCount = Count; - MaxSuccessor = *SI; - } - } - - // Now that we picked the maximally executed successor, place it. - PlaceBlocks(MaxSuccessor); - } -} diff --git a/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp b/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp index f0d29c8..007e9b7 100644 --- a/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -22,7 +22,6 @@ #include "llvm/Analysis/DominatorInternals.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/ProfileInfo.h" #include "llvm/Assembly/Writer.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -76,10 +75,10 @@ namespace { class CodeGenPrepare : public FunctionPass { /// TLI - Keep a pointer of a TargetLowering to consult for determining /// transformation profitability. + const TargetMachine *TM; const TargetLowering *TLI; const TargetLibraryInfo *TLInfo; DominatorTree *DT; - ProfileInfo *PFI; /// CurInstIterator - As we scan instructions optimizing them, this is the /// next instruction to optimize. 
Xforms that can invalidate this should @@ -100,8 +99,8 @@ namespace { public: static char ID; // Pass identification, replacement for typeid - explicit CodeGenPrepare(const TargetLowering *tli = 0) - : FunctionPass(ID), TLI(tli) { + explicit CodeGenPrepare(const TargetMachine *TM = 0) + : FunctionPass(ID), TM(TM), TLI(0) { initializeCodeGenPreparePass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F); @@ -110,7 +109,6 @@ namespace { virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<DominatorTree>(); - AU.addPreserved<ProfileInfo>(); AU.addRequired<TargetLibraryInfo>(); } @@ -139,17 +137,17 @@ INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_PASS_END(CodeGenPrepare, "codegenprepare", "Optimize for code generation", false, false) -FunctionPass *llvm::createCodeGenPreparePass(const TargetLowering *TLI) { - return new CodeGenPrepare(TLI); +FunctionPass *llvm::createCodeGenPreparePass(const TargetMachine *TM) { + return new CodeGenPrepare(TM); } bool CodeGenPrepare::runOnFunction(Function &F) { bool EverMadeChange = false; ModifiedDT = false; + if (TM) TLI = TM->getTargetLowering(); TLInfo = &getAnalysis<TargetLibraryInfo>(); DT = getAnalysisIfAvailable<DominatorTree>(); - PFI = getAnalysisIfAvailable<ProfileInfo>(); OptSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); @@ -205,7 +203,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) { SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB)); DeleteDeadBlock(BB); - + for (SmallVectorImpl<BasicBlock*>::iterator II = Successors.begin(), IE = Successors.end(); II != IE; ++II) if (pred_begin(*II) == pred_end(*II)) @@ -440,10 +438,6 @@ void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) { DT->changeImmediateDominator(DestBB, NewIDom); DT->eraseNode(BB); } - if (PFI) { - PFI->replaceAllUses(BB, DestBB); - PFI->removeEdge(ProfileInfo::getEdge(BB, DestBB)); - } BB->eraseFromParent(); ++NumBlocksElim; @@ -830,7 +824,7 @@ struct ExtAddrMode : public TargetLowering::AddrMode { ExtAddrMode() : BaseReg(0), ScaledReg(0) {} void print(raw_ostream &OS) const; void dump() const; - + bool operator==(const ExtAddrMode& O) const { return (BaseReg == O.BaseReg) && (ScaledReg == O.ScaledReg) && (BaseGV == O.BaseGV) && (BaseOffs == O.BaseOffs) && @@ -838,10 +832,12 @@ struct ExtAddrMode : public TargetLowering::AddrMode { } }; +#ifndef NDEBUG static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) { AM.print(OS); return OS; } +#endif void ExtAddrMode::print(raw_ostream &OS) const { bool NeedPlus = false; @@ -866,7 +862,6 @@ void ExtAddrMode::print(raw_ostream &OS) const { OS << (NeedPlus ? " + " : "") << Scale << "*"; WriteAsOperand(OS, ScaledReg, /*PrintType=*/false); - NeedPlus = true; } OS << ']'; @@ -891,16 +886,16 @@ class AddressingModeMatcher { /// the memory instruction that we're computing this address for. Type *AccessTy; Instruction *MemoryInst; - + /// AddrMode - This is the addressing mode that we're building up. This is /// part of the return value of this addressing mode matching stuff. ExtAddrMode &AddrMode; - + /// IgnoreProfitability - This is set to true when we should not do /// profitability checks. When true, IsProfitableToFoldIntoAddressingMode /// always returns true. 
bool IgnoreProfitability; - + AddressingModeMatcher(SmallVectorImpl<Instruction*> &AMI, const TargetLowering &T, Type *AT, Instruction *MI, ExtAddrMode &AM) @@ -908,7 +903,7 @@ class AddressingModeMatcher { IgnoreProfitability = false; } public: - + /// Match - Find the maximal addressing mode that a load/store of V can fold, /// give an access type of AccessTy. This returns a list of involved /// instructions in AddrModeInsts. @@ -918,7 +913,7 @@ public: const TargetLowering &TLI) { ExtAddrMode Result; - bool Success = + bool Success = AddressingModeMatcher(AddrModeInsts, TLI, AccessTy, MemoryInst, Result).MatchAddr(V, 0); (void)Success; assert(Success && "Couldn't select *anything*?"); @@ -943,11 +938,11 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, // mode. Just process that directly. if (Scale == 1) return MatchAddr(ScaleReg, Depth); - + // If the scale is 0, it takes nothing to add this. if (Scale == 0) return true; - + // If we already have a scale of this value, we can add to it, otherwise, we // need an available scale field. if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg) @@ -966,7 +961,7 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, // It was legal, so commit it. AddrMode = TestAddrMode; - + // Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now // to see if ScaleReg is actually X+C. If so, we can turn this into adding // X*Scale + C*Scale to addr mode. @@ -975,7 +970,7 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) { TestAddrMode.ScaledReg = AddLHS; TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale; - + // If this addressing mode is legal, commit it and remember that we folded // this instruction. if (TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) { @@ -1026,7 +1021,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, unsigned Depth) { // Avoid exponential behavior on extremely deep expression trees. if (Depth >= 5) return false; - + switch (Opcode) { case Instruction::PtrToInt: // PtrToInt is always a noop, as we know that the int type is pointer sized. @@ -1034,7 +1029,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, case Instruction::IntToPtr: // This inttoptr is a no-op if the integer type is pointer sized. if (TLI.getValueType(AddrInst->getOperand(0)->getType()) == - TLI.getPointerTy()) + TLI.getPointerTy(AddrInst->getType()->getPointerAddressSpace())) return MatchAddr(AddrInst->getOperand(0), Depth); return false; case Instruction::BitCast: @@ -1055,16 +1050,16 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, if (MatchAddr(AddrInst->getOperand(1), Depth+1) && MatchAddr(AddrInst->getOperand(0), Depth+1)) return true; - + // Restore the old addr mode info. AddrMode = BackupAddrMode; AddrModeInsts.resize(OldSize); - + // Otherwise this was over-aggressive. Try merging in the LHS then the RHS. if (MatchAddr(AddrInst->getOperand(0), Depth+1) && MatchAddr(AddrInst->getOperand(1), Depth+1)) return true; - + // Otherwise we definitely can't merge the ADD in. 
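// --- Illustrative sketch (not part of the patch): the backup/try/restore
// backtracking idiom the surrounding ADD case uses, reduced to assumed
// standalone types. The matcher snapshots its state, speculatively matches
// the operands in one order, and rolls back before trying the other order.
#include <vector>

struct MatchStateSketch { long Offset; std::vector<int> Insts; };

static bool matchOperand(MatchStateSketch &S, int Op) {
  if (Op < 0)
    return false;          // assumption: a negative operand never matches
  S.Insts.push_back(Op);   // speculative progress recorded in the state
  S.Offset += Op;
  return true;
}

static bool matchAdd(MatchStateSketch &S, int LHS, int RHS) {
  MatchStateSketch Backup = S;                    // snapshot before trying
  if (matchOperand(S, LHS) && matchOperand(S, RHS))
    return true;                                  // first order folded
  S = Backup;                                     // roll back partial work
  if (matchOperand(S, RHS) && matchOperand(S, LHS))
    return true;                                  // second order folded
  S = Backup;                                     // restore and give up
  return false;
}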
AddrMode = BackupAddrMode; AddrModeInsts.resize(OldSize); @@ -1081,7 +1076,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, int64_t Scale = RHS->getSExtValue(); if (Opcode == Instruction::Shl) Scale = 1LL << Scale; - + return MatchScaledValue(AddrInst->getOperand(0), Scale, Depth); } case Instruction::GetElementPtr: { @@ -1089,7 +1084,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, // one variable offset. int VariableOperand = -1; unsigned VariableScale = 0; - + int64_t ConstantOffset = 0; const DataLayout *TD = TLI.getDataLayout(); gep_type_iterator GTI = gep_type_begin(AddrInst); @@ -1107,14 +1102,14 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, // We only allow one variable index at the moment. if (VariableOperand != -1) return false; - + // Remember the variable index. VariableOperand = i; VariableScale = TypeSize; } } } - + // A common case is for the GEP to only do a constant offset. In this case, // just add it to the disp field and check validity. if (VariableOperand == -1) { @@ -1208,7 +1203,7 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { AddrModeInsts.push_back(I); return true; } - + // It isn't profitable to do this, roll back. //cerr << "NOT FOLDING: " << *I; AddrMode = BackupAddrMode; @@ -1254,7 +1249,7 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(ImmutableCallSite(CI)); for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; - + // Compute the constraint code and ConstraintType to use. TLI.ComputeConstraintToUse(OpInfo, SDValue()); @@ -1279,7 +1274,7 @@ static bool FindAllMemoryUses(Instruction *I, // If we already considered this instruction, we're done. if (!ConsideredInsts.insert(I)) return false; - + // If this is an obviously unfoldable instruction, bail out. if (!MightBeFoldableInst(I)) return true; @@ -1293,24 +1288,24 @@ static bool FindAllMemoryUses(Instruction *I, MemoryUses.push_back(std::make_pair(LI, UI.getOperandNo())); continue; } - + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { unsigned opNo = UI.getOperandNo(); if (opNo == 0) return true; // Storing addr, not into addr. MemoryUses.push_back(std::make_pair(SI, opNo)); continue; } - + if (CallInst *CI = dyn_cast<CallInst>(U)) { InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue()); if (!IA) return true; - + // If this is a memory operand, we're cool, otherwise bail out. if (!IsOperandAMemoryOperand(CI, IA, I, TLI)) return true; continue; } - + if (FindAllMemoryUses(cast<Instruction>(U), MemoryUses, ConsideredInsts, TLI)) return true; @@ -1328,17 +1323,17 @@ bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, // If Val is either of the known-live values, we know it is live! if (Val == 0 || Val == KnownLive1 || Val == KnownLive2) return true; - + // All values other than instructions and arguments (e.g. constants) are live. if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true; - + // If Val is a constant sized alloca in the entry block, it is live, this is // true because it is just a reference to the stack/frame pointer, which is // live for the whole function. if (AllocaInst *AI = dyn_cast<AllocaInst>(Val)) if (AI->isStaticAlloca()) return true; - + // Check to see if this value is already used in the memory instruction's // block. 
If so, it's already live into the block at the very least, so we // can reasonably fold it. @@ -1370,7 +1365,7 @@ bool AddressingModeMatcher:: IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, ExtAddrMode &AMAfter) { if (IgnoreProfitability) return true; - + // AMBefore is the addressing mode before this instruction was folded into it, // and AMAfter is the addressing mode after the instruction was folded. Get // the set of registers referenced by AMAfter and subtract out those @@ -1381,7 +1376,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, // BaseReg and ScaleReg (global addresses are always available, as are any // folded immediates). Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg; - + // If the BaseReg or ScaledReg was referenced by the previous addrmode, their // lifetime wasn't extended by adding this instruction. if (ValueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg)) @@ -1402,7 +1397,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, SmallPtrSet<Instruction*, 16> ConsideredInsts; if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI)) return false; // Has a non-memory, non-foldable use! - + // Now that we know that all uses of this instruction are part of a chain of // computation involving only operations that could theoretically be folded // into a memory use, loop over each of these uses and see if they could @@ -1411,15 +1406,14 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) { Instruction *User = MemoryUses[i].first; unsigned OpNo = MemoryUses[i].second; - + // Get the access type of this use. If the use isn't a pointer, we don't // know what it accesses. Value *Address = User->getOperand(OpNo); if (!Address->getType()->isPointerTy()) return false; - Type *AddressAccessTy = - cast<PointerType>(Address->getType())->getElementType(); - + Type *AddressAccessTy = Address->getType()->getPointerElementType(); + // Do a match against the root of this address, ignoring profitability. This // will tell us if the addressing mode for the memory operation will // *actually* cover the shared instruction. @@ -1434,10 +1428,10 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, if (std::find(MatchedAddrModeInsts.begin(), MatchedAddrModeInsts.end(), I) == MatchedAddrModeInsts.end()) return false; - + MatchedAddrModeInsts.clear(); } - + return true; } @@ -1572,9 +1566,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, } else { DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " << *MemoryInst); - Type *IntPtrTy = - TLI->getDataLayout()->getIntPtrType(AccessTy->getContext()); - + Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(Addr->getType()); Value *Result = 0; // Start with the base register. Do this first so that subsequent address @@ -1893,7 +1885,8 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) { // It is possible for very late stage optimizations (such as SimplifyCFG) // to introduce PHI nodes too late to be cleaned up. If we detect such a // trivial PHI, go ahead and zap it here. - if (Value *V = SimplifyInstruction(P)) { + if (Value *V = SimplifyInstruction(P, TLI ? 
TLI->getDataLayout() : 0, + TLInfo, DT)) { P->replaceAllUsesWith(V); P->eraseFromParent(); ++NumPHIsElim; diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 3c08634..5266894 100644 --- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -72,11 +72,6 @@ namespace { } namespace llvm { -// SimpleValue is POD. -template<> struct isPodLike<SimpleValue> { - static const bool value = true; -}; - template<> struct DenseMapInfo<SimpleValue> { static inline SimpleValue getEmptyKey() { return DenseMapInfo<Instruction*>::getEmptyKey(); @@ -220,11 +215,6 @@ namespace { } namespace llvm { - // CallValue is POD. - template<> struct isPodLike<CallValue> { - static const bool value = true; - }; - template<> struct DenseMapInfo<CallValue> { static inline CallValue getEmptyKey() { return DenseMapInfo<Instruction*>::getEmptyKey(); diff --git a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp new file mode 100644 index 0000000..e7de07f --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -0,0 +1,79 @@ +//===- FlattenCFGPass.cpp - CFG Flatten Pass ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements flattening of CFG. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "flattencfg" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Pass.h" +#include "llvm/Support/CFG.h" +#include "llvm/Transforms/Utils/Local.h" +using namespace llvm; + +namespace { +struct FlattenCFGPass : public FunctionPass { + static char ID; // Pass identification, replacement for typeid +public: + FlattenCFGPass() : FunctionPass(ID) { + initializeFlattenCFGPassPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F); + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<AliasAnalysis>(); + } + +private: + AliasAnalysis *AA; +}; +} + +char FlattenCFGPass::ID = 0; +INITIALIZE_PASS_BEGIN(FlattenCFGPass, "flattencfg", "Flatten the CFG", false, + false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_END(FlattenCFGPass, "flattencfg", "Flatten the CFG", false, + false) + +// Public interface to the FlattenCFG pass +FunctionPass *llvm::createFlattenCFGPass() { return new FlattenCFGPass(); } + +/// iterativelyFlattenCFG - Call FlattenCFG on all the blocks in the function, +/// iterating until no more changes are made. +static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { + bool Changed = false; + bool LocalChange = true; + while (LocalChange) { + LocalChange = false; + + // Loop over all of the basic blocks and remove them if they are unneeded... + // + for (Function::iterator BBIt = F.begin(); BBIt != F.end();) { + if (FlattenCFG(BBIt++, AA)) { + LocalChange = true; + } + } + Changed |= LocalChange; + } + return Changed; +} + +bool FlattenCFGPass::runOnFunction(Function &F) { + AA = &getAnalysis<AliasAnalysis>(); + bool EverChanged = false; + // iterativelyFlattenCFG can make some blocks dead. 
+ while (iterativelyFlattenCFG(F, AA)) { + removeUnreachableBlocks(F); + EverChanged = true; + } + return EverChanged; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp index f350b9b..6af269d 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp @@ -21,8 +21,10 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -45,6 +47,7 @@ #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include <vector> using namespace llvm; using namespace PatternMatch; @@ -505,7 +508,9 @@ namespace { enum ValType { SimpleVal, // A simple offsetted value that is accessed. LoadVal, // A value produced by a load. - MemIntrin // A memory intrinsic which is loaded from. + MemIntrin, // A memory intrinsic which is loaded from. + UndefVal // A UndefValue representing a value from dead block (which + // is not yet physically removed from the CFG). }; /// V - The value that is live out of the block. @@ -543,10 +548,20 @@ namespace { Res.Offset = Offset; return Res; } - + + static AvailableValueInBlock getUndef(BasicBlock *BB) { + AvailableValueInBlock Res; + Res.BB = BB; + Res.Val.setPointer(0); + Res.Val.setInt(UndefVal); + Res.Offset = 0; + return Res; + } + bool isSimpleValue() const { return Val.getInt() == SimpleVal; } bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; } bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; } + bool isUndefValue() const { return Val.getInt() == UndefVal; } Value *getSimpleValue() const { assert(isSimpleValue() && "Wrong accessor"); @@ -574,6 +589,7 @@ namespace { DominatorTree *DT; const DataLayout *TD; const TargetLibraryInfo *TLI; + SetVector<BasicBlock *> DeadBlocks; ValueTable VN; @@ -692,9 +708,13 @@ namespace { void cleanupGlobalSets(); void verifyRemoved(const Instruction *I) const; bool splitCriticalEdges(); + BasicBlock *splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ); unsigned replaceAllDominatedUsesWith(Value *From, Value *To, const BasicBlockEdge &Root); bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root); + bool processFoldableCondBr(BranchInst *BI); + void addDeadBlock(BasicBlock *BB); + void assignValNumForDeadCode(); }; char GVN::ID = 0; @@ -1068,14 +1088,15 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, if (Offset == -1) return Offset; + unsigned AS = Src->getType()->getPointerAddressSpace(); // Otherwise, see if we can constant fold a load from the constant with the // offset applied as appropriate. 
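// --- Illustrative sketch (not part of the patch): the idea behind the
// folding step below, in plain C++. The patch's version works on LLVM
// Constants (bitcast to i8* in the right address space, GEP by Offset,
// bitcast to the loaded type); here memcpy plays the role of the re-typed
// load. The byte values and the little-endian note are assumptions.
#include <cstring>

static const unsigned char ConstBytes[8] = {1, 0, 0, 0, 2, 0, 0, 0};

template <typename T> static T loadAtOffset(unsigned Off) {
  T V;
  std::memcpy(&V, ConstBytes + Off, sizeof(T)); // caller ensures in-bounds
  return V;
}
// On a little-endian host, loadAtOffset<unsigned>(4) folds to 2.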
Src = ConstantExpr::getBitCast(Src,
- llvm::Type::getInt8PtrTy(Src->getContext()));
+ Type::getInt8PtrTy(Src->getContext(), AS));
Constant *OffsetCst =
ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
Src = ConstantExpr::getGetElementPtr(Src, OffsetCst);
- Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy));
+ Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
if (ConstantFoldLoadFromConstPtr(Src, &TD))
return Offset;
return -1;
@@ -1152,7 +1173,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
Type *DestPTy = IntegerType::get(LoadTy->getContext(), NewLoadSize*8);
DestPTy = PointerType::get(DestPTy,
- cast<PointerType>(PtrVal->getType())->getAddressSpace());
+ PtrVal->getType()->getPointerAddressSpace());
Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc());
PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
LoadInst *NewLoad = Builder.CreateLoad(PtrVal);
@@ -1227,15 +1248,16 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
// Otherwise, this is a memcpy/memmove from a constant global.
MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
Constant *Src = cast<Constant>(MTI->getSource());
+ unsigned AS = Src->getType()->getPointerAddressSpace();
// Otherwise, see if we can constant fold a load from the constant with the
// offset applied as appropriate.
Src = ConstantExpr::getBitCast(Src,
- llvm::Type::getInt8PtrTy(Src->getContext()));
+ Type::getInt8PtrTy(Src->getContext(), AS));
Constant *OffsetCst =
- ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+ ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
Src = ConstantExpr::getGetElementPtr(Src, OffsetCst);
- Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy));
+ Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
return ConstantFoldLoadFromConstPtr(Src, &TD);
}
@@ -1250,8 +1272,10 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI,
// just use the dominating value directly.
if (ValuesPerBlock.size() == 1 &&
gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB,
- LI->getParent()))
+ LI->getParent())) {
+ assert(!ValuesPerBlock[0].isUndefValue() && "Dead BB dominate this block");
return ValuesPerBlock[0].MaterializeAdjustedValue(LI->getType(), gvn);
+ }
// Otherwise, we have to construct SSA form.
SmallVector<PHINode*, 8> NewPHIs;
@@ -1321,7 +1345,7 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) c
<< *getCoercedLoadValue() << '\n'
<< *Res << '\n' << "\n\n\n");
}
- } else {
+ } else if (isMemIntrinValue()) {
const DataLayout *TD = gvn.getDataLayout();
assert(TD && "Need target data to handle type mismatch case");
Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset,
@@ -1329,6 +1353,10 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) c
DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
<< " " << *getMemIntrinValue() << '\n'
<< *Res << '\n' << "\n\n\n");
+ } else {
+ assert(isUndefValue() && "Should be UndefVal");
+ DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";);
+ return UndefValue::get(LoadTy);
}
return Res;
}
@@ -1352,6 +1380,13 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
BasicBlock *DepBB = Deps[i].getBB();
MemDepResult DepInfo = Deps[i].getResult();
+ if (DeadBlocks.count(DepBB)) {
+ // A dead dependent mem-op is disguised as a load evaluating the same value
+ // as the load in question.
+ ValuesPerBlock.push_back(AvailableValueInBlock::getUndef(DepBB));
+ continue;
+ }
+
if (!DepInfo.isDef() && !DepInfo.isClobber()) {
UnavailableBlocks.push_back(DepBB);
continue;
}
@@ -1513,7 +1548,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i)
FullyAvailableBlocks[UnavailableBlocks[i]] = false;
- SmallVector<std::pair<TerminatorInst*, unsigned>, 4> NeedToSplit;
+ SmallVector<BasicBlock *, 4> CriticalEdgePred;
for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB);
PI != E; ++PI) {
BasicBlock *Pred = *PI;
@@ -1536,20 +1571,14 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
return false;
}
- unsigned SuccNum = GetSuccessorNumber(Pred, LoadBB);
- NeedToSplit.push_back(std::make_pair(Pred->getTerminator(), SuccNum));
+ CriticalEdgePred.push_back(Pred);
}
}
- if (!NeedToSplit.empty()) {
- toSplit.append(NeedToSplit.begin(), NeedToSplit.end());
- return false;
- }
-
// Decide whether PRE is profitable for this load.
unsigned NumUnavailablePreds = PredLoads.size();
assert(NumUnavailablePreds != 0 &&
- "Fully available value should be eliminated above!");
+ "Fully available value should already be eliminated!");
// If this load is unavailable in multiple predecessors, reject it.
// FIXME: If we could restructure the CFG, we could make a common pred with
@@ -1558,6 +1587,17 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
if (NumUnavailablePreds != 1)
return false;
+ // Split critical edges, and update the unavailable predecessors accordingly.
+ for (SmallVectorImpl<BasicBlock *>::iterator I = CriticalEdgePred.begin(),
+ E = CriticalEdgePred.end(); I != E; I++) {
+ BasicBlock *OrigPred = *I;
+ BasicBlock *NewPred = splitCriticalEdges(OrigPred, LoadBB);
+ PredLoads.erase(OrigPred);
+ PredLoads[NewPred] = 0;
+ DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->"
+ << LoadBB->getName() << '\n');
+ }
+
// Check if the load can safely be moved to all the unavailable predecessors.
bool CanDoPRE = true;
SmallVector<Instruction*, 8> NewInsts;
@@ -1594,7 +1634,9 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
if (MD) MD->removeInstruction(I);
I->eraseFromParent();
}
- return false;
+ // HINT: Don't revert the edge-splitting, as a following transformation may
+ // also need to split these critical edges.
+ return !CriticalEdgePred.empty();
}
// Okay, we can eliminate this load by inserting a reload in the predecessor
@@ -2181,11 +2223,13 @@ bool GVN::processInstruction(Instruction *I) {
// For conditional branches, we can perform simple conditional propagation on
// the condition value itself.
if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
- if (!BI->isConditional() || isa<Constant>(BI->getCondition()))
+ if (!BI->isConditional())
return false;
- Value *BranchCond = BI->getCondition();
+ if (isa<Constant>(BI->getCondition()))
+ return processFoldableCondBr(BI);
+ Value *BranchCond = BI->getCondition();
BasicBlock *TrueSucc = BI->getSuccessor(0);
BasicBlock *FalseSucc = BI->getSuccessor(1);
// Avoid multiple edges early.
@@ -2297,25 +2341,30 @@ bool GVN::runOnFunction(Function& F) {
while (ShouldContinue) {
DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n");
ShouldContinue = iterateOnFunction(F);
- if (splitCriticalEdges())
- ShouldContinue = true;
Changed |= ShouldContinue;
++Iteration;
}
if (EnablePRE) {
+ // Fabricate val-nums for dead code in order to suppress the assertion in
+ // performPRE().
+ assignValNumForDeadCode();
bool PREChanged = true;
while (PREChanged) {
PREChanged = performPRE(F);
Changed |= PREChanged;
}
}
+
// FIXME: Should perform GVN again after PRE does something. PRE can move
// computations into blocks where they become fully redundant. Note that
// we can't do this until PRE's critical edge splitting updates memdep.
// Actually, when this happens, we should just fully integrate PRE into GVN.
cleanupGlobalSets();
+ // Do not clean up DeadBlocks in cleanupGlobalSets(), as that function is
+ // called for each iteration.
+ DeadBlocks.clear();
return Changed;
}
@@ -2326,6 +2375,9 @@ bool GVN::processBlock(BasicBlock *BB) {
// (and incrementing BI before processing an instruction).
assert(InstrsToErase.empty() &&
"We expect InstrsToErase to be empty across iterations");
+ if (DeadBlocks.count(BB))
+ return false;
+
bool ChangedFunction = false;
for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
@@ -2344,7 +2396,7 @@ bool GVN::processBlock(BasicBlock *BB) {
if (!AtStart)
--BI;
- for (SmallVector<Instruction*, 4>::iterator I = InstrsToErase.begin(),
+ for (SmallVectorImpl<Instruction *>::iterator I = InstrsToErase.begin(),
E = InstrsToErase.end(); I != E; ++I) {
DEBUG(dbgs() << "GVN removed: " << **I << '\n');
if (MD) MD->removeInstruction(*I);
@@ -2543,6 +2595,15 @@ bool GVN::performPRE(Function &F) {
return Changed;
}
+/// Split the critical edge connecting the given two blocks, and return
+/// the block inserted on the critical edge.
+BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) {
+ BasicBlock *BB = SplitCriticalEdge(Pred, Succ, this);
+ if (MD)
+ MD->invalidateCachedPredecessors();
+ return BB;
+}
+
/// splitCriticalEdges - Split critical edges found during the previous
/// iteration that may enable further optimization.
bool GVN::splitCriticalEdges() {
@@ -2569,9 +2630,18 @@ bool GVN::iterateOnFunction(Function &F) {
RE = RPOT.end(); RI != RE; ++RI)
Changed |= processBlock(*RI);
#else
+ // Save the blocks this function has before the transformation begins. GVN
+ // may split critical edges, and hence may invalidate the RPO/DT iterators.
+ //
+ std::vector<BasicBlock *> BBVect;
+ BBVect.reserve(256);
for (df_iterator<DomTreeNode*> DI = df_begin(DT->getRootNode()),
DE = df_end(DT->getRootNode()); DI != DE; ++DI)
- Changed |= processBlock(DI->getBlock());
+ BBVect.push_back(DI->getBlock());
+
+ for (std::vector<BasicBlock *>::iterator I = BBVect.begin(), E = BBVect.end();
+ I != E; I++)
+ Changed |= processBlock(*I);
#endif
return Changed;
@@ -2601,3 +2671,133 @@ void GVN::verifyRemoved(const Instruction *Inst) const {
}
}
}
+
+// BB is declared dead, which implies that other blocks become dead as well.
+// This function adds all these blocks to "DeadBlocks". For the dead blocks'
+// live successors, update their phi nodes by replacing the operands
+// corresponding to dead blocks with UndefVal.
+//
+void GVN::addDeadBlock(BasicBlock *BB) {
+ SmallVector<BasicBlock *, 4> NewDead;
+ SmallSetVector<BasicBlock *, 4> DF;
+
+ NewDead.push_back(BB);
+ while (!NewDead.empty()) {
+ BasicBlock *D = NewDead.pop_back_val();
+ if (DeadBlocks.count(D))
+ continue;
+
+ // All blocks dominated by D are dead.
+ SmallVector<BasicBlock *, 8> Dom;
+ DT->getDescendants(D, Dom);
+ DeadBlocks.insert(Dom.begin(), Dom.end());
+
+ // Figure out the dominance-frontier(D).
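// --- Illustrative sketch (not part of the patch): the propagation rule the
// loop below implements, on an assumed standalone graph. The dominator-tree
// seeding used by addDeadBlock is omitted; only the "all predecessors dead"
// rule is shown. Successors that keep a live predecessor are exactly the
// dominance-frontier blocks whose PHIs later receive undef operands.
#include <set>
#include <vector>

struct BlockSketch { std::vector<int> Preds, Succs; };

static void propagateDead(const std::vector<BlockSketch> &G, int Root,
                          std::set<int> &Dead) {
  std::vector<int> Work(1, Root);
  while (!Work.empty()) {
    int D = Work.back();
    Work.pop_back();
    if (!Dead.insert(D).second)
      continue; // already known dead
    for (unsigned i = 0, e = G[D].Succs.size(); i != e; ++i) {
      int S = G[D].Succs[i];
      bool AllPredsDead = true;
      for (unsigned j = 0, f = G[S].Preds.size(); j != f; ++j)
        if (!Dead.count(G[S].Preds[j])) {
          AllPredsDead = false;
          break;
        }
      if (AllPredsDead)
        Work.push_back(S); // dead by implication, keep propagating
      // otherwise S stays live for now: it is in the dominance frontier
    }
  }
}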
+ for (SmallVectorImpl<BasicBlock *>::iterator I = Dom.begin(),
+ E = Dom.end(); I != E; I++) {
+ BasicBlock *B = *I;
+ for (succ_iterator SI = succ_begin(B), SE = succ_end(B); SI != SE; SI++) {
+ BasicBlock *S = *SI;
+ if (DeadBlocks.count(S))
+ continue;
+
+ bool AllPredDead = true;
+ for (pred_iterator PI = pred_begin(S), PE = pred_end(S); PI != PE; PI++)
+ if (!DeadBlocks.count(*PI)) {
+ AllPredDead = false;
+ break;
+ }
+
+ if (!AllPredDead) {
+ // S could be proved dead later on. That is why we don't update phi
+ // operands at this moment.
+ DF.insert(S);
+ } else {
+ // Although S is not dominated by D, it is dead by now. This can happen
+ // if S already had a dead predecessor before D was declared dead.
+ NewDead.push_back(S);
+ }
+ }
+ }
+ }
+
+ // For the dead blocks' live successors, update their phi nodes by replacing
+ // the operands corresponding to dead blocks with UndefVal.
+ for (SmallSetVector<BasicBlock *, 4>::iterator I = DF.begin(), E = DF.end();
+ I != E; I++) {
+ BasicBlock *B = *I;
+ if (DeadBlocks.count(B))
+ continue;
+
+ SmallVector<BasicBlock *, 4> Preds(pred_begin(B), pred_end(B));
+ for (SmallVectorImpl<BasicBlock *>::iterator PI = Preds.begin(),
+ PE = Preds.end(); PI != PE; PI++) {
+ BasicBlock *P = *PI;
+
+ if (!DeadBlocks.count(P))
+ continue;
+
+ if (isCriticalEdge(P->getTerminator(), GetSuccessorNumber(P, B))) {
+ if (BasicBlock *S = splitCriticalEdges(P, B))
+ DeadBlocks.insert(P = S);
+ }
+
+ for (BasicBlock::iterator II = B->begin(); isa<PHINode>(II); ++II) {
+ PHINode &Phi = cast<PHINode>(*II);
+ Phi.setIncomingValue(Phi.getBasicBlockIndex(P),
+ UndefValue::get(Phi.getType()));
+ }
+ }
+ }
+}
+
+// If the given branch is recognized as a foldable branch (i.e. a conditional
+// branch with a constant condition), perform the following analyses and
+// transformations:
+// 1) If the dead outgoing edge is a critical edge, split it. Let R be the
+// target of the dead outgoing edge.
+// 2) Identify the set of dead blocks implied by the branch's dead outgoing
+// edge. The result of this step will be {X | X is dominated by R}.
+// 3) Identify those blocks which have at least one dead predecessor. The
+// result of this step will be dominance-frontier(R).
+// 4) Update the PHIs in DF(R) by replacing the operands corresponding to
+// dead blocks with "UndefVal", in the hope that these PHIs will be
+// optimized away.
+//
+// Return true iff *NEW* dead code is found.
+bool GVN::processFoldableCondBr(BranchInst *BI) {
+ if (!BI || BI->isUnconditional())
+ return false;
+
+ ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
+ if (!Cond)
+ return false;
+
+ BasicBlock *DeadRoot = Cond->getZExtValue() ?
+ BI->getSuccessor(1) : BI->getSuccessor(0);
+ if (DeadBlocks.count(DeadRoot))
+ return false;
+
+ if (!DeadRoot->getSinglePredecessor())
+ DeadRoot = splitCriticalEdges(BI->getParent(), DeadRoot);
+
+ addDeadBlock(DeadRoot);
+ return true;
+}
+
+// performPRE() will trigger an assert if it comes across an instruction
+// without an associated val-num. As a function normally has far more live
+// instructions than dead instructions, it makes more sense just to
+// "fabricate" a val-number for the dead code than to check whether each
+// instruction involved is dead or not.
+void GVN::assignValNumForDeadCode() { + for (SetVector<BasicBlock *>::iterator I = DeadBlocks.begin(), + E = DeadBlocks.end(); I != E; I++) { + BasicBlock *BB = *I; + for (BasicBlock::iterator II = BB->begin(), EE = BB->end(); + II != EE; II++) { + Instruction *Inst = &*II; + unsigned ValNum = VN.lookup_or_add(Inst); + addToLeaderTable(ValNum, Inst, BB); + } + } +} diff --git a/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp b/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp index 4796eb2..954e545 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp @@ -72,15 +72,13 @@ using namespace llvm; static cl::opt<bool> EnableGlobalMergeOnConst("global-merge-on-const", cl::Hidden, - cl::desc("Enable global merge pass on constants"), - cl::init(false)); + cl::desc("Enable global merge pass on constants"), + cl::init(false)); STATISTIC(NumMerged , "Number of globals merged"); namespace { class GlobalMerge : public FunctionPass { - /// TLI - Keep a pointer of a TargetLowering to consult for determining - /// target type sizes. - const TargetLowering *TLI; + const TargetMachine *TM; bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals, Module &M, bool isConst, unsigned AddrSpace) const; @@ -104,8 +102,8 @@ namespace { public: static char ID; // Pass identification, replacement for typeid. - explicit GlobalMerge(const TargetLowering *tli = 0) - : FunctionPass(ID), TLI(tli) { + explicit GlobalMerge(const TargetMachine *TM = 0) + : FunctionPass(ID), TM(TM) { initializeGlobalMergePass(*PassRegistry::getPassRegistry()); } @@ -144,6 +142,7 @@ INITIALIZE_PASS(GlobalMerge, "global-merge", bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, Module &M, bool isConst, unsigned AddrSpace) const { + const TargetLowering *TLI = TM->getTargetLowering(); const DataLayout *TD = TLI->getDataLayout(); // FIXME: Infer the maximum possible offset depending on the actual users @@ -234,6 +233,7 @@ void GlobalMerge::setMustKeepGlobalVariables(Module &M) { bool GlobalMerge::doInitialization(Module &M) { DenseMap<unsigned, SmallVector<GlobalVariable*, 16> > Globals, ConstGlobals, BSSGlobals; + const TargetLowering *TLI = TM->getTargetLowering(); const DataLayout *TD = TLI->getDataLayout(); unsigned MaxOffset = TLI->getMaximalGlobalOffset(); bool Changed = false; @@ -305,6 +305,6 @@ bool GlobalMerge::doFinalization(Module &M) { return false; } -Pass *llvm::createGlobalMergePass(const TargetLowering *tli) { - return new GlobalMerge(tli); +Pass *llvm::createGlobalMergePass(const TargetMachine *TM) { + return new GlobalMerge(TM); } diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 8e76c78..235aaaa 100644 --- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -532,7 +532,8 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { // and varies predictably *inside* the loop. Evaluate the value it // contains when the loop exits, if possible. 
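// --- Illustrative sketch (not part of the patch): the source-level effect of
// exit-value rewriting. The value S holds when the loop exits is computable
// from the trip count, so uses after the loop can bypass the loop entirely;
// the isSafeToExpand() guard added in this hunk ensures materializing that
// expression cannot introduce, say, a division the loop never executed.
static int sumConstant(int N) { // assumption: N >= 0
  int S = 0;
  for (int I = 0; I != N; ++I)
    S += 2;
  return S; // exit value: rewritable as "2 * N", making the loop dead
}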
const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop()); - if (!SE->isLoopInvariant(ExitValue, L)) + if (!SE->isLoopInvariant(ExitValue, L) || + !isSafeToExpand(ExitValue, *SE)) continue; // Computing the value outside of the loop brings no benefit if : @@ -1479,8 +1480,14 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, if (IndVar->getType()->isPointerTy() && !IVCount->getType()->isPointerTy()) { + // IVOffset will be the new GEP offset that is interpreted by GEP as a + // signed value. IVCount on the other hand represents the loop trip count, + // which is an unsigned value. FindLoopCounter only allows induction + // variables that have a positive unit stride of one. This means we don't + // have to handle the case of negative offsets (yet) and just need to zero + // extend IVCount. Type *OfsTy = SE->getEffectiveSCEVType(IVInit->getType()); - const SCEV *IVOffset = SE->getTruncateOrSignExtend(IVCount, OfsTy); + const SCEV *IVOffset = SE->getTruncateOrZeroExtend(IVCount, OfsTy); // Expand the code for the iteration count. assert(SE->isLoopInvariant(IVOffset, L) && @@ -1492,7 +1499,7 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, assert(AR->getStart() == SE->getSCEV(GEPBase) && "bad loop counter"); // We could handle pointer IVs other than i8*, but we need to compensate for // gep index scaling. See canExpandBackedgeTakenCount comments. - assert(SE->getSizeOfExpr( + assert(SE->getSizeOfExpr(IntegerType::getInt64Ty(IndVar->getContext()), cast<PointerType>(GEPBase->getType())->getElementType())->isOne() && "unit stride pointer IV must be i8*"); @@ -1506,9 +1513,10 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, // BECount = (IVEnd - IVInit - 1) => IVLimit = IVInit (postinc). // // Valid Cases: (1) both integers is most common; (2) both may be pointers - // for simple memset-style loops; (3) IVInit is an integer and IVCount is a - // pointer may occur when enable-iv-rewrite generates a canonical IV on top - // of case #2. + // for simple memset-style loops. + // + // IVInit integer and IVCount pointer would only occur if a canonical IV + // were generated on top of case #2, which is not expected. const SCEV *IVLimit = 0; // For unit stride, IVCount = Start + BECount with 2's complement overflow. @@ -1552,44 +1560,23 @@ LinearFunctionTestReplace(Loop *L, SCEVExpander &Rewriter) { assert(canExpandBackedgeTakenCount(L, SE) && "precondition"); - // LFTR can ignore IV overflow and truncate to the width of - // BECount. This avoids materializing the add(zext(add)) expression. - Type *CntTy = BackedgeTakenCount->getType(); - + // Initialize CmpIndVar and IVCount to their preincremented values. + Value *CmpIndVar = IndVar; const SCEV *IVCount = BackedgeTakenCount; // If the exiting block is the same as the backedge block, we prefer to // compare against the post-incremented value, otherwise we must compare // against the preincremented value. - Value *CmpIndVar; if (L->getExitingBlock() == L->getLoopLatch()) { // Add one to the "backedge-taken" count to get the trip count. - // If this addition may overflow, we have to be more pessimistic and - // cast the induction variable before doing the add. 
- const SCEV *N = - SE->getAddExpr(IVCount, SE->getConstant(IVCount->getType(), 1)); - if (CntTy == IVCount->getType()) - IVCount = N; - else { - const SCEV *Zero = SE->getConstant(IVCount->getType(), 0); - if ((isa<SCEVConstant>(N) && !N->isZero()) || - SE->isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, N, Zero)) { - // No overflow. Cast the sum. - IVCount = SE->getTruncateOrZeroExtend(N, CntTy); - } else { - // Potential overflow. Cast before doing the add. - IVCount = SE->getTruncateOrZeroExtend(IVCount, CntTy); - IVCount = SE->getAddExpr(IVCount, SE->getConstant(CntTy, 1)); - } - } + // This addition may overflow, which is valid as long as the comparison is + // truncated to BackedgeTakenCount->getType(). + IVCount = SE->getAddExpr(BackedgeTakenCount, + SE->getConstant(BackedgeTakenCount->getType(), 1)); // The BackedgeTaken expression contains the number of times that the // backedge branches to the loop header. This is one less than the // number of times the loop executes, so use the incremented indvar. CmpIndVar = IndVar->getIncomingValueForBlock(L->getExitingBlock()); - } else { - // We must use the preincremented value... - IVCount = SE->getTruncateOrZeroExtend(IVCount, CntTy); - CmpIndVar = IndVar; } Value *ExitCnt = genLoopLimit(IndVar, IVCount, L, Rewriter, SE); @@ -1612,12 +1599,40 @@ LinearFunctionTestReplace(Loop *L, << " IVCount:\t" << *IVCount << "\n"); IRBuilder<> Builder(BI); - if (SE->getTypeSizeInBits(CmpIndVar->getType()) - > SE->getTypeSizeInBits(ExitCnt->getType())) { - CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(), - "lftr.wideiv"); - } + // LFTR can ignore IV overflow and truncate to the width of + // BECount. This avoids materializing the add(zext(add)) expression. + unsigned CmpIndVarSize = SE->getTypeSizeInBits(CmpIndVar->getType()); + unsigned ExitCntSize = SE->getTypeSizeInBits(ExitCnt->getType()); + if (CmpIndVarSize > ExitCntSize) { + const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IndVar)); + const SCEV *ARStart = AR->getStart(); + const SCEV *ARStep = AR->getStepRecurrence(*SE); + // For constant IVCount, avoid truncation. + if (isa<SCEVConstant>(ARStart) && isa<SCEVConstant>(IVCount)) { + const APInt &Start = cast<SCEVConstant>(ARStart)->getValue()->getValue(); + APInt Count = cast<SCEVConstant>(IVCount)->getValue()->getValue(); + // Note that the post-inc value of BackedgeTakenCount may have overflowed + // above such that IVCount is now zero. 
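// --- Illustrative sketch (not part of the patch): the constant-widening
// arithmetic below, with concrete numbers. Assume an 8-bit IV starting at 10
// with a positive unit stride, and a post-incremented trip count whose 8-bit
// value wrapped around to 0 (meaning 256 iterations).
static bool widenedLimitExample() {
  unsigned Start = 10;               // AR start, already in the wide type
  unsigned Count = 0;                // 8-bit IVCount wrapped around to zero
  if (Count == 0)
    Count = 256;                     // re-materialize 2^8 in the wide type
  unsigned NewLimit = Start + Count; // positive stride: Start + Count
  return NewLimit == 266;            // wide compare, no IV truncation needed
}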
+ if (IVCount != BackedgeTakenCount && Count == 0) { + Count = APInt::getMaxValue(Count.getBitWidth()).zext(CmpIndVarSize); + ++Count; + } + else + Count = Count.zext(CmpIndVarSize); + APInt NewLimit; + if (cast<SCEVConstant>(ARStep)->getValue()->isNegative()) + NewLimit = Start - Count; + else + NewLimit = Start + Count; + ExitCnt = ConstantInt::get(CmpIndVar->getType(), NewLimit); + + DEBUG(dbgs() << " Widen RHS:\t" << *ExitCnt << "\n"); + } else { + CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(), + "lftr.wideiv"); + } + } Value *Cond = Builder.CreateICmp(P, CmpIndVar, ExitCnt, "exitcond"); Value *OrigCond = BI->getCondition(); // It's tempting to use replaceAllUsesWith here to fully replace the old diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp index b61c5ba..b3ec2fc 100644 --- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" @@ -129,6 +130,7 @@ namespace { bool ProcessBranchOnXOR(BinaryOperator *BO); bool SimplifyPartiallyRedundantLoad(LoadInst *LI); + bool TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB); }; } @@ -775,7 +777,11 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { return true; } } + } + + if (CondBr && CondConst && TryToUnfoldSelect(CondCmp, BB)) + return true; } // Check for some cases that are worth simplifying. Right now we want to look @@ -821,7 +827,6 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { return false; } - /// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant /// load instruction, eliminate it by replacing it with a PHI node. This is an /// important optimization that encourages jump threading, and needs to be run @@ -836,6 +841,12 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { if (LoadBB->getSinglePredecessor()) return false; + // If the load is defined in a landing pad, it can't be partially redundant, + // because the edges between the invoke and the landing pad cannot have other + // instructions between them. + if (LoadBB->isLandingPad()) + return false; + Value *LoadedPtr = LI->getOperand(0); // If the loaded operand is defined in the LoadBB, it can't be available. @@ -1615,4 +1626,80 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, return true; } +/// TryToUnfoldSelect - Look for blocks of the form +/// bb1: +/// %a = select +/// br bb +/// +/// bb2: +/// %p = phi [%a, %bb] ... +/// %c = icmp %p +/// br i1 %c +/// +/// And expand the select into a branch structure if one of its arms allows %c +/// to be folded. This later enables threading from bb1 over bb2. 
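A source-level sketch of code that can produce the pattern documented above (a hypothetical example; names and constants are illustrative, and the actual IR may differ). The implementation follows.

    int f(bool c, bool x, int y) {
      // In bb1 the ternary becomes: %a = select i1 %x, i32 17, i32 %y
      int p = c ? (x ? 17 : y) : 0;
      // In bb2: %p = phi [%a, %bb1], [0, ...]; %cmp = icmp eq i32 %p, 0
      // Along the select's true arm the compare folds (17 != 0) while the
      // false arm stays unknown, so exactly one side folds; the select is
      // expanded into a branch, enabling threading from bb1 over bb2.
      return p == 0 ? -1 : 1;
    }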
+bool JumpThreading::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) { + BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator()); + PHINode *CondLHS = dyn_cast<PHINode>(CondCmp->getOperand(0)); + Constant *CondRHS = cast<Constant>(CondCmp->getOperand(1)); + + if (!CondBr || !CondBr->isConditional() || !CondLHS || + CondLHS->getParent() != BB) + return false; + + for (unsigned I = 0, E = CondLHS->getNumIncomingValues(); I != E; ++I) { + BasicBlock *Pred = CondLHS->getIncomingBlock(I); + SelectInst *SI = dyn_cast<SelectInst>(CondLHS->getIncomingValue(I)); + // Look if one of the incoming values is a select in the corresponding + // predecessor. + if (!SI || SI->getParent() != Pred || !SI->hasOneUse()) + continue; + + BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator()); + if (!PredTerm || !PredTerm->isUnconditional()) + continue; + + // Now check if one of the select values would allow us to constant fold the + // terminator in BB. We don't do the transform if both sides fold, those + // cases will be threaded in any case. + LazyValueInfo::Tristate LHSFolds = + LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1), + CondRHS, Pred, BB); + LazyValueInfo::Tristate RHSFolds = + LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(2), + CondRHS, Pred, BB); + if ((LHSFolds != LazyValueInfo::Unknown || + RHSFolds != LazyValueInfo::Unknown) && + LHSFolds != RHSFolds) { + // Expand the select. + // + // Pred -- + // | v + // | NewBB + // | | + // |----- + // v + // BB + BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "select.unfold", + BB->getParent(), BB); + // Move the unconditional branch to NewBB. + PredTerm->removeFromParent(); + NewBB->getInstList().insert(NewBB->end(), PredTerm); + // Create a conditional branch and update PHI nodes. + BranchInst::Create(NewBB, BB, SI->getCondition(), Pred); + CondLHS->setIncomingValue(I, SI->getFalseValue()); + CondLHS->addIncoming(SI->getTrueValue(), NewBB); + // The select is now dead. + SI->eraseFromParent(); + + // Update any other PHI nodes in BB. + for (BasicBlock::iterator BI = BB->begin(); + PHINode *Phi = dyn_cast<PHINode>(BI); ++BI) + if (Phi != CondLHS) + Phi->addIncoming(Phi->getIncomingValueForBlock(Pred), NewBB); + return true; + } + } + return false; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index 0b62050..9e39d2e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -51,8 +51,8 @@ namespace { } private: - bool isLoopDead(Loop *L, SmallVector<BasicBlock*, 4> &exitingBlocks, - SmallVector<BasicBlock*, 4> &exitBlocks, + bool isLoopDead(Loop *L, SmallVectorImpl<BasicBlock *> &exitingBlocks, + SmallVectorImpl<BasicBlock *> &exitBlocks, bool &Changed, BasicBlock *Preheader); }; @@ -77,8 +77,8 @@ Pass *llvm::createLoopDeletionPass() { /// checked for unique exit and exiting blocks, and that the code is in LCSSA /// form. bool LoopDeletion::isLoopDead(Loop *L, - SmallVector<BasicBlock*, 4> &exitingBlocks, - SmallVector<BasicBlock*, 4> &exitBlocks, + SmallVectorImpl<BasicBlock *> &exitingBlocks, + SmallVectorImpl<BasicBlock *> &exitBlocks, bool &Changed, BasicBlock *Preheader) { BasicBlock *exitBlock = exitBlocks[0]; @@ -209,7 +209,7 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { // Move all of the block's children to be children of the preheader, which // allows us to remove the domtree entry for the block. 
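The SmallVector-to-SmallVectorImpl signature changes in the isLoopDead hunks above follow the usual LLVM ADT idiom: a SmallVectorImpl<T>& parameter binds to a SmallVector of any inline size. A minimal sketch of why (assuming the LLVM ADT headers):

    #include "llvm/ADT/SmallVector.h"
    using namespace llvm;

    // Callers may pick any inline capacity without changing this signature.
    static size_t countBlocks(SmallVectorImpl<int> &V) { return V.size(); }

    void demo() {
      SmallVector<int, 4> A;
      SmallVector<int, 8> B;
      countBlocks(A); // OK
      countBlocks(B); // also OK; a SmallVector<int, 4>& parameter would not bind
    }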
ChildNodes.insert(ChildNodes.begin(), DT[*LI]->begin(), DT[*LI]->end()); - for (SmallVector<DomTreeNode*, 8>::iterator DI = ChildNodes.begin(), + for (SmallVectorImpl<DomTreeNode *>::iterator DI = ChildNodes.begin(), DE = ChildNodes.end(); DI != DE; ++DI) { DT.changeImmediateDominator(*DI, DT[preheader]); } diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 8258719..952b76b 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -81,7 +81,7 @@ namespace { /// Return the condition of the branch terminating the given basic block. static Value *getBrCondtion(BasicBlock *); - /// Derive the precondition block (i.e the block that guards the loop + /// Derive the precondition block (i.e the block that guards the loop /// preheader) from the given preheader. static BasicBlock *getPrecondBb(BasicBlock *PreHead); }; @@ -111,7 +111,7 @@ namespace { /// between a variable and zero, and if the variable is non-zero, the /// control yields to the loop entry. If the branch matches the behavior, /// the variable involved in the comparison is returned. This function will - /// be called to see if the precondition and postcondition of the loop + /// be called to see if the precondition and postcondition of the loop /// are in desirable form. Value *matchCondition (BranchInst *Br, BasicBlock *NonZeroTarget) const; @@ -274,11 +274,11 @@ static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE, // //===----------------------------------------------------------------------===// -// This fucntion will return true iff the given block contains nothing but goto. -// A typical usage of this function is to check if the preheader fucntion is -// "almost" empty such that generated intrinsic function can be moved across -// preheader and to be placed at the end of the preconditiona block without -// concerning of breaking data dependence. +// This function will return true iff the given block contains nothing but goto. +// A typical usage of this function is to check if the preheader function is +// "almost" empty such that generated intrinsic functions can be moved across +// the preheader and be placed at the end of the precondition block without +// the concern of breaking data dependence. bool LIRUtil::isAlmostEmpty(BasicBlock *BB) { if (BranchInst *Br = getBranch(BB)) { return Br->isUnconditional() && BB->size() == 1; } @@ -314,7 +314,7 @@ bool NclPopcountRecognize::preliminaryScreen() { if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware) return false; - // Counting population are usually conducted by few arithmetic instrutions. + // Counting population are usually conducted by few arithmetic instructions. // Such instructions can be easilly "absorbed" by vacant slots in a // non-compact loop. Therefore, recognizing popcount idiom only makes sense // in a compact loop.
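For orientation, a sketch of the population-count idiom this recognizer is after (hypothetical source; the transform comments later in this file describe the same shape), with the precondition and postcondition branches called out:

    int popcount_idiom(unsigned x) {
      int cnt = 0;
      if (x) {                // precondition block: branch on x != 0
        do {
          ++cnt;
          x &= x - 1;         // clear the lowest set bit
        } while (x);          // postcondition: loop while x != 0
      }
      return cnt;             // the loop collapses to a ctpop intrinsic call
    }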
@@ -339,7 +339,7 @@ bool NclPopcountRecognize::preliminaryScreen() { PreCondBB = LIRUtil::getPrecondBb(PreHead); if (!PreCondBB) return false; - + return true; } @@ -504,7 +504,7 @@ void NclPopcountRecognize::transform(Instruction *CntInst, // Assuming before transformation, the loop is following: // if (x) // the precondition // do { cnt++; x &= x - 1; } while(x); - + // Step 1: Insert the ctpop instruction at the end of the precondition block IRBuilderTy Builder(PreCondBr); Value *PopCnt, *PopCntZext, *NewCount, *TripCnt; @@ -611,7 +611,7 @@ void NclPopcountRecognize::transform(Instruction *CntInst, SE->forgetLoop(CurLoop); } -CallInst *NclPopcountRecognize::createPopcntIntrinsic(IRBuilderTy &IRBuilder, +CallInst *NclPopcountRecognize::createPopcntIntrinsic(IRBuilderTy &IRBuilder, Value *Val, DebugLoc DL) { Value *Ops[] = { Val }; Type *Tys[] = { Val->getType() }; @@ -667,13 +667,13 @@ bool LoopIdiomRecognize::runOnCountableLoop() { if (!getDataLayout()) return false; - // set DT + // set DT (void)getDominatorTree(); LoopInfo &LI = getAnalysis<LoopInfo>(); TLI = &getAnalysis<TargetLibraryInfo>(); - // set TLI + // set TLI (void)getTargetLibraryInfo(); SmallVector<BasicBlock*, 8> ExitBlocks; @@ -953,6 +953,8 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, Value *SplatValue = isBytewiseValue(StoredVal); Constant *PatternValue = 0; + unsigned DestAS = DestPtr->getType()->getPointerAddressSpace(); + // If we're allowed to form a memset, and the stored value would be acceptable // for memset, use it. if (SplatValue && TLI->has(LibFunc::memset) && @@ -961,8 +963,10 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, CurLoop->isLoopInvariant(SplatValue)) { // Keep and use SplatValue. PatternValue = 0; - } else if (TLI->has(LibFunc::memset_pattern16) && + } else if (DestAS == 0 && + TLI->has(LibFunc::memset_pattern16) && (PatternValue = getMemSetPatternValue(StoredVal, *TD))) { + // Don't create memset_pattern16s with address spaces. // It looks like we can use PatternValue! SplatValue = 0; } else { @@ -978,20 +982,20 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, IRBuilder<> Builder(Preheader->getTerminator()); SCEVExpander Expander(*SE, "loop-idiom"); + Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS); + // Okay, we have a strided store "p[i]" of a splattable value. We can turn // this into a memset in the loop preheader now if we want. However, this // would be unsafe to do if there is anything else in the loop that may read // or write to the aliased location. Check for any overlap by generating the // base pointer and checking the region. - unsigned AddrSpace = cast<PointerType>(DestPtr->getType())->getAddressSpace(); Value *BasePtr = - Expander.expandCodeFor(Ev->getStart(), Builder.getInt8PtrTy(AddrSpace), + Expander.expandCodeFor(Ev->getStart(), DestInt8PtrTy, Preheader->getTerminator()); - if (mayLoopAccessLocation(BasePtr, AliasAnalysis::ModRef, CurLoop, BECount, - StoreSize, getAnalysis<AliasAnalysis>(), TheStore)){ + StoreSize, getAnalysis<AliasAnalysis>(), TheStore)) { Expander.clear(); // If we generated new code for the base pointer, clean up. deleteIfDeadInstruction(BasePtr, *SE, TLI); @@ -1002,27 +1006,35 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. 
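That byte count, (BECount + 1) * StoreSize, is easiest to see against the loop shape processLoopStridedStore rewrites; a hypothetical sketch, with the following hunk expanding the same product symbolically:

    #include <cstring>

    void zero_bytes(char *p, unsigned long n) {
      for (unsigned long i = 0; i < n; ++i)
        p[i] = 0;             // unit-stride store of a splattable value
    }

    // After recognition the loop body is gone and the preheader carries:
    //   memset(p, 0, (BECount + 1) * StoreSize)  -- here ((n-1) + 1) * 1 == n
    void zero_bytes_rewritten(char *p, unsigned long n) {
      if (n)                  // the loop guard is preserved
        std::memset(p, 0, n);
    }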
- Type *IntPtr = TD->getIntPtrType(DestPtr->getContext()); + Type *IntPtr = Builder.getIntPtrTy(TD, DestAS); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), SCEV::FlagNUW); - if (StoreSize != 1) + if (StoreSize != 1) { NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize), SCEV::FlagNUW); + } Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); CallInst *NewCall; - if (SplatValue) - NewCall = Builder.CreateMemSet(BasePtr, SplatValue,NumBytes,StoreAlignment); - else { + if (SplatValue) { + NewCall = Builder.CreateMemSet(BasePtr, + SplatValue, + NumBytes, + StoreAlignment); + } else { + // Everything is emitted in default address space + Type *Int8PtrTy = DestInt8PtrTy; + Module *M = TheStore->getParent()->getParent()->getParent(); Value *MSP = M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(), - Builder.getInt8PtrTy(), - Builder.getInt8PtrTy(), IntPtr, + Int8PtrTy, + Int8PtrTy, + IntPtr, (void*)0); // Otherwise we should form a memset_pattern16. PatternValue is known to be @@ -1032,7 +1044,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, PatternValue, ".memset_pattern"); GV->setUnnamedAddr(true); // Ok to merge these. GV->setAlignment(16); - Value *PatternPtr = ConstantExpr::getBitCast(GV, Builder.getInt8PtrTy()); + Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy); NewCall = Builder.CreateCall3(MSP, BasePtr, PatternPtr, NumBytes); } @@ -1108,17 +1120,17 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. - Type *IntPtr = TD->getIntPtrType(SI->getContext()); - BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); + Type *IntPtrTy = Builder.getIntPtrTy(TD, SI->getPointerAddressSpace()); + BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy); - const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), + const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtrTy, 1), SCEV::FlagNUW); if (StoreSize != 1) - NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize), + NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize), SCEV::FlagNUW); Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); + Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); CallInst *NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp new file mode 100644 index 0000000..335af81 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -0,0 +1,1184 @@ +//===-- LoopReroll.cpp - Loop rerolling pass ------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass implements a simple loop reroller. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loop-reroll" +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" + +using namespace llvm; + +STATISTIC(NumRerolledLoops, "Number of rerolled loops"); + +static cl::opt<unsigned> +MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden, + cl::desc("The maximum increment for loop rerolling")); + +// This loop re-rolling transformation aims to transform loops like this: +// +// int foo(int a); +// void bar(int *x) { +// for (int i = 0; i < 500; i += 3) { +// foo(i); +// foo(i+1); +// foo(i+2); +// } +// } +// +// into a loop like this: +// +// void bar(int *x) { +// for (int i = 0; i < 500; ++i) +// foo(i); +// } +// +// It does this by looking for loops that, besides the latch code, are composed +// of isomorphic DAGs of instructions, with each DAG rooted at some increment +// to the induction variable, and where each DAG is isomorphic to the DAG +// rooted at the induction variable (excepting the sub-DAGs which root the +// other induction-variable increments). In other words, we're looking for loop +// bodies of the form: +// +// %iv = phi [ (preheader, ...), (body, %iv.next) ] +// f(%iv) +// %iv.1 = add %iv, 1 <-- a root increment +// f(%iv.1) +// %iv.2 = add %iv, 2 <-- a root increment +// f(%iv.2) +// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment +// f(%iv.scale_m_1) +// ... +// %iv.next = add %iv, scale +// %cmp = icmp(%iv, ...) +// br %cmp, header, exit +// +// where each f(i) is a set of instructions that, collectively, are a function +// only of i (and other loop-invariant values). +// +// As a special case, we can also reroll loops like this: +// +// int foo(int); +// void bar(int *x) { +// for (int i = 0; i < 500; ++i) { +// x[3*i] = foo(0); +// x[3*i+1] = foo(0); +// x[3*i+2] = foo(0); +// } +// } +// +// into this: +// +// void bar(int *x) { +// for (int i = 0; i < 1500; ++i) +// x[i] = foo(0); +// } +// +// in which case, we're looking for inputs like this: +// +// %iv = phi [ (preheader, ...), (body, %iv.next) ] +// %scaled.iv = mul %iv, scale +// f(%scaled.iv) +// %scaled.iv.1 = add %scaled.iv, 1 +// f(%scaled.iv.1) +// %scaled.iv.2 = add %scaled.iv, 2 +// f(%scaled.iv.2) +// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1 +// f(%scaled.iv.scale_m_1) +// ... +// %iv.next = add %iv, 1 +// %cmp = icmp(%iv, ...) 
+// br %cmp, header, exit + +namespace { + class LoopReroll : public LoopPass { + public: + static char ID; // Pass ID, replacement for typeid + LoopReroll() : LoopPass(ID) { + initializeLoopRerollPass(*PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<AliasAnalysis>(); + AU.addRequired<LoopInfo>(); + AU.addPreserved<LoopInfo>(); + AU.addRequired<DominatorTree>(); + AU.addPreserved<DominatorTree>(); + AU.addRequired<ScalarEvolution>(); + AU.addRequired<TargetLibraryInfo>(); + } + +protected: + AliasAnalysis *AA; + LoopInfo *LI; + ScalarEvolution *SE; + DataLayout *DL; + TargetLibraryInfo *TLI; + DominatorTree *DT; + + typedef SmallVector<Instruction *, 16> SmallInstructionVector; + typedef SmallSet<Instruction *, 16> SmallInstructionSet; + + // A chain of isomorphic instructions, identified by a single-use PHI, + // representing a reduction. Only the last value may be used outside the + // loop. + struct SimpleLoopReduction { + SimpleLoopReduction(Instruction *P, Loop *L) + : Valid(false), Instructions(1, P) { + assert(isa<PHINode>(P) && "First reduction instruction must be a PHI"); + add(L); + } + + bool valid() const { + return Valid; + } + + Instruction *getPHI() const { + assert(Valid && "Using invalid reduction"); + return Instructions.front(); + } + + Instruction *getReducedValue() const { + assert(Valid && "Using invalid reduction"); + return Instructions.back(); + } + + Instruction *get(size_t i) const { + assert(Valid && "Using invalid reduction"); + return Instructions[i+1]; + } + + Instruction *operator [] (size_t i) const { return get(i); } + + // The size, ignoring the initial PHI. + size_t size() const { + assert(Valid && "Using invalid reduction"); + return Instructions.size()-1; + } + + typedef SmallInstructionVector::iterator iterator; + typedef SmallInstructionVector::const_iterator const_iterator; + + iterator begin() { + assert(Valid && "Using invalid reduction"); + return llvm::next(Instructions.begin()); + } + + const_iterator begin() const { + assert(Valid && "Using invalid reduction"); + return llvm::next(Instructions.begin()); + } + + iterator end() { return Instructions.end(); } + const_iterator end() const { return Instructions.end(); } + + protected: + bool Valid; + SmallInstructionVector Instructions; + + void add(Loop *L); + }; + + // The set of all reductions, and state tracking of possible reductions + // during loop instruction processing. + struct ReductionTracker { + typedef SmallVector<SimpleLoopReduction, 16> SmallReductionVector; + + // Add a new possible reduction. + void addSLR(SimpleLoopReduction &SLR) { + PossibleReds.push_back(SLR); + } + + // Set up to track possible reductions corresponding to the provided + // rerolling scale. Only reductions with a number of non-PHI instructions + // that is divisible by the scale are considered. Three instruction sets + // are filled in: + // - A set of all possible instructions in eligible reductions. + // - A set of all PHIs in eligible reductions. + // - A set of all reduced values (last instructions) in eligible reductions.
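A sketch of the reduction shape SimpleLoopReduction models and restrictToScale (whose implementation follows) filters, assuming a hypothetical unrolled-by-3 accumulation:

    int sum3(const int *a, int n) {   // assumes n is a multiple of 3
      int s = 0;                      // becomes the reduction's PHI
      for (int i = 0; i < n; i += 3) {
        s += a[i];                    // chain entry, base iteration
        s += a[i + 1];                // chain entry, iteration 1
        s += a[i + 2];                // reduced value; the only entry that may
      }                               // be used outside the loop
      return s;
    }
    // The chain has 3 non-PHI instructions, divisible by Scale == 3, so it
    // lands in PossibleRedSet / PossibleRedPHISet / PossibleRedLastSet.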
+ void restrictToScale(uint64_t Scale, + SmallInstructionSet &PossibleRedSet, + SmallInstructionSet &PossibleRedPHISet, + SmallInstructionSet &PossibleRedLastSet) { + PossibleRedIdx.clear(); + PossibleRedIter.clear(); + Reds.clear(); + + for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i) + if (PossibleReds[i].size() % Scale == 0) { + PossibleRedLastSet.insert(PossibleReds[i].getReducedValue()); + PossibleRedPHISet.insert(PossibleReds[i].getPHI()); + + PossibleRedSet.insert(PossibleReds[i].getPHI()); + PossibleRedIdx[PossibleReds[i].getPHI()] = i; + for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(), + JE = PossibleReds[i].end(); J != JE; ++J) { + PossibleRedSet.insert(*J); + PossibleRedIdx[*J] = i; + } + } + } + + // The functions below are used while processing the loop instructions. + + // Are the two instructions both from reductions, and furthermore, from + // the same reduction? + bool isPairInSame(Instruction *J1, Instruction *J2) { + DenseMap<Instruction *, int>::iterator J1I = PossibleRedIdx.find(J1); + if (J1I != PossibleRedIdx.end()) { + DenseMap<Instruction *, int>::iterator J2I = PossibleRedIdx.find(J2); + if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second) + return true; + } + + return false; + } + + // The two provided instructions, the first from the base iteration, and + // the second from iteration i, form a matched pair. If these are part of + // a reduction, record that fact. + void recordPair(Instruction *J1, Instruction *J2, unsigned i) { + if (PossibleRedIdx.count(J1)) { + assert(PossibleRedIdx.count(J2) && + "Recording reduction vs. non-reduction instruction?"); + + PossibleRedIter[J1] = 0; + PossibleRedIter[J2] = i; + + int Idx = PossibleRedIdx[J1]; + assert(Idx == PossibleRedIdx[J2] && + "Recording pair from different reductions?"); + Reds.insert(Idx); + } + } + + // The functions below can be called after we've finished processing all + // instructions in the loop, and we know which reductions were selected. + + // Is the provided instruction the PHI of a reduction selected for + // rerolling? + bool isSelectedPHI(Instruction *J) { + if (!isa<PHINode>(J)) + return false; + + for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); + RI != RIE; ++RI) { + int i = *RI; + if (cast<Instruction>(J) == PossibleReds[i].getPHI()) + return true; + } + + return false; + } + + bool validateSelected(); + void replaceSelected(); + + protected: + // The vector of all possible reductions (for any scale). 
+ SmallReductionVector PossibleReds; + + DenseMap<Instruction *, int> PossibleRedIdx; + DenseMap<Instruction *, int> PossibleRedIter; + DenseSet<int> Reds; + }; + + void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs); + void collectPossibleReductions(Loop *L, + ReductionTracker &Reductions); + void collectInLoopUserSet(Loop *L, + const SmallInstructionVector &Roots, + const SmallInstructionSet &Exclude, + const SmallInstructionSet &Final, + DenseSet<Instruction *> &Users); + void collectInLoopUserSet(Loop *L, + Instruction * Root, + const SmallInstructionSet &Exclude, + const SmallInstructionSet &Final, + DenseSet<Instruction *> &Users); + bool findScaleFromMul(Instruction *RealIV, uint64_t &Scale, + Instruction *&IV, + SmallInstructionVector &LoopIncs); + bool collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, Instruction *IV, + SmallVector<SmallInstructionVector, 32> &Roots, + SmallInstructionSet &AllRoots, + SmallInstructionVector &LoopIncs); + bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount, + ReductionTracker &Reductions); + }; +} + +char LoopReroll::ID = 0; +INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false) + +Pass *llvm::createLoopRerollPass() { + return new LoopReroll; +} + +// Returns true if the provided instruction is used outside the given loop. +// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in +// non-loop blocks to be outside the loop. +static bool hasUsesOutsideLoop(Instruction *I, Loop *L) { + for (Value::use_iterator UI = I->use_begin(), + UIE = I->use_end(); UI != UIE; ++UI) { + Instruction *User = cast<Instruction>(*UI); + if (!L->contains(User)) + return true; + } + + return false; +} + +// Collect the list of loop induction variables with respect to which it might +// be possible to reroll the loop. +void LoopReroll::collectPossibleIVs(Loop *L, + SmallInstructionVector &PossibleIVs) { + BasicBlock *Header = L->getHeader(); + for (BasicBlock::iterator I = Header->begin(), + IE = Header->getFirstInsertionPt(); I != IE; ++I) { + if (!isa<PHINode>(I)) + continue; + if (!I->getType()->isIntegerTy()) + continue; + + if (const SCEVAddRecExpr *PHISCEV = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(I))) { + if (PHISCEV->getLoop() != L) + continue; + if (!PHISCEV->isAffine()) + continue; + if (const SCEVConstant *IncSCEV = + dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE))) { + if (!IncSCEV->getValue()->getValue().isStrictlyPositive()) + continue; + if (IncSCEV->getValue()->uge(MaxInc)) + continue; + + DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << + *PHISCEV << "\n"); + PossibleIVs.push_back(I); + } + } + } +} + +// Add the remainder of the reduction-variable chain to the instruction vector +// (the initial PHINode has already been added). If successful, the object is +// marked as valid. +void LoopReroll::SimpleLoopReduction::add(Loop *L) { + assert(!Valid && "Cannot add to an already-valid chain"); + + // The reduction variable must be a chain of single-use instructions + // (including the PHI), except for the last value (which is used by the PHI + // and also outside the loop). 
+ Instruction *C = Instructions.front(); + + do { + C = cast<Instruction>(*C->use_begin()); + if (C->hasOneUse()) { + if (!C->isBinaryOp()) + return; + + if (!(isa<PHINode>(Instructions.back()) || + C->isSameOperationAs(Instructions.back()))) + return; + + Instructions.push_back(C); + } + } while (C->hasOneUse()); + + if (Instructions.size() < 2 || + !C->isSameOperationAs(Instructions.back()) || + C->use_begin() == C->use_end()) + return; + + // C is now the (potential) last instruction in the reduction chain. + for (Value::use_iterator UI = C->use_begin(), UIE = C->use_end(); + UI != UIE; ++UI) { + // The only in-loop user can be the initial PHI. + if (L->contains(cast<Instruction>(*UI))) + if (cast<Instruction>(*UI ) != Instructions.front()) + return; + } + + Instructions.push_back(C); + Valid = true; +} + +// Collect the vector of possible reduction variables. +void LoopReroll::collectPossibleReductions(Loop *L, + ReductionTracker &Reductions) { + BasicBlock *Header = L->getHeader(); + for (BasicBlock::iterator I = Header->begin(), + IE = Header->getFirstInsertionPt(); I != IE; ++I) { + if (!isa<PHINode>(I)) + continue; + if (!I->getType()->isSingleValueType()) + continue; + + SimpleLoopReduction SLR(I, L); + if (!SLR.valid()) + continue; + + DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " << + SLR.size() << " chained instructions)\n"); + Reductions.addSLR(SLR); + } +} + +// Collect the set of all users of the provided root instruction. This set of +// users contains not only the direct users of the root instruction, but also +// all users of those users, and so on. There are two exceptions: +// +// 1. Instructions in the set of excluded instructions are never added to the +// use set (even if they are users). This is used, for example, to exclude +// including root increments in the use set of the primary IV. +// +// 2. Instructions in the set of final instructions are added to the use set +// if they are users, but their users are not added. This is used, for +// example, to prevent a reduction update from forcing all later reduction +// updates into the use set. +void LoopReroll::collectInLoopUserSet(Loop *L, + Instruction *Root, const SmallInstructionSet &Exclude, + const SmallInstructionSet &Final, + DenseSet<Instruction *> &Users) { + SmallInstructionVector Queue(1, Root); + while (!Queue.empty()) { + Instruction *I = Queue.pop_back_val(); + if (!Users.insert(I).second) + continue; + + if (!Final.count(I)) + for (Value::use_iterator UI = I->use_begin(), + UIE = I->use_end(); UI != UIE; ++UI) { + Instruction *User = cast<Instruction>(*UI); + if (PHINode *PN = dyn_cast<PHINode>(User)) { + // Ignore "wrap-around" uses to PHIs of this loop's header. + if (PN->getIncomingBlock(UI) == L->getHeader()) + continue; + } + + if (L->contains(User) && !Exclude.count(User)) { + Queue.push_back(User); + } + } + + // We also want to collect single-user "feeder" values. + for (User::op_iterator OI = I->op_begin(), + OIE = I->op_end(); OI != OIE; ++OI) { + if (Instruction *Op = dyn_cast<Instruction>(*OI)) + if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) && + !Final.count(Op)) + Queue.push_back(Op); + } + } +} + +// Collect all of the users of all of the provided root instructions (combined +// into a single set). 
+void LoopReroll::collectInLoopUserSet(Loop *L, + const SmallInstructionVector &Roots, + const SmallInstructionSet &Exclude, + const SmallInstructionSet &Final, + DenseSet<Instruction *> &Users) { + for (SmallInstructionVector::const_iterator I = Roots.begin(), + IE = Roots.end(); I != IE; ++I) + collectInLoopUserSet(L, *I, Exclude, Final, Users); +} + +static bool isSimpleLoadStore(Instruction *I) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return LI->isSimple(); + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return SI->isSimple(); + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) + return !MI->isVolatile(); + return false; +} + +// Recognize loops that are setup like this: +// +// %iv = phi [ (preheader, ...), (body, %iv.next) ] +// %scaled.iv = mul %iv, scale +// f(%scaled.iv) +// %scaled.iv.1 = add %scaled.iv, 1 +// f(%scaled.iv.1) +// %scaled.iv.2 = add %scaled.iv, 2 +// f(%scaled.iv.2) +// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1 +// f(%scaled.iv.scale_m_1) +// ... +// %iv.next = add %iv, 1 +// %cmp = icmp(%iv, ...) +// br %cmp, header, exit +// +// and, if found, set IV = %scaled.iv, and add %iv.next to LoopIncs. +bool LoopReroll::findScaleFromMul(Instruction *RealIV, uint64_t &Scale, + Instruction *&IV, + SmallInstructionVector &LoopIncs) { + // This is a special case: here we're looking for all uses (except for + // the increment) to be multiplied by a common factor. The increment must + // be by one. This is to capture loops like: + // for (int i = 0; i < 500; ++i) { + // foo(3*i); foo(3*i+1); foo(3*i+2); + // } + if (RealIV->getNumUses() != 2) + return false; + const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(RealIV)); + Instruction *User1 = cast<Instruction>(*RealIV->use_begin()), + *User2 = cast<Instruction>(*llvm::next(RealIV->use_begin())); + if (!SE->isSCEVable(User1->getType()) || !SE->isSCEVable(User2->getType())) + return false; + const SCEVAddRecExpr *User1SCEV = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User1)), + *User2SCEV = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User2)); + if (!User1SCEV || !User1SCEV->isAffine() || + !User2SCEV || !User2SCEV->isAffine()) + return false; + + // We assume below that User1 is the scale multiply and User2 is the + // increment. If this can't be true, then swap them. + if (User1SCEV == RealIVSCEV->getPostIncExpr(*SE)) { + std::swap(User1, User2); + std::swap(User1SCEV, User2SCEV); + } + + if (User2SCEV != RealIVSCEV->getPostIncExpr(*SE)) + return false; + assert(User2SCEV->getStepRecurrence(*SE)->isOne() && + "Invalid non-unit step for multiplicative scaling"); + LoopIncs.push_back(User2); + + if (const SCEVConstant *MulScale = + dyn_cast<SCEVConstant>(User1SCEV->getStepRecurrence(*SE))) { + // Make sure that both the start and step have the same multiplier. + if (RealIVSCEV->getStart()->getType() != MulScale->getType()) + return false; + if (SE->getMulExpr(RealIVSCEV->getStart(), MulScale) != + User1SCEV->getStart()) + return false; + + ConstantInt *MulScaleCI = MulScale->getValue(); + if (!MulScaleCI->uge(2) || MulScaleCI->uge(MaxInc)) + return false; + Scale = MulScaleCI->getZExtValue(); + IV = User1; + } else + return false; + + DEBUG(dbgs() << "LRR: Found possible scaling " << *User1 << "\n"); + return true; +} + +// Collect all root increments with respect to the provided induction variable +// (normally the PHI, but sometimes a multiply). A root increment is an +// instruction, normally an add, with a positive constant less than Scale. 
In a +// rerollable loop, each of these increments is the root of an instruction +// graph isomorphic to the others. Also, we collect the final induction +// increment (the increment equal to the Scale), and its users in LoopIncs. +bool LoopReroll::collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, + Instruction *IV, + SmallVector<SmallInstructionVector, 32> &Roots, + SmallInstructionSet &AllRoots, + SmallInstructionVector &LoopIncs) { + for (Value::use_iterator UI = IV->use_begin(), + UIE = IV->use_end(); UI != UIE; ++UI) { + Instruction *User = cast<Instruction>(*UI); + if (!SE->isSCEVable(User->getType())) + continue; + if (User->getType() != IV->getType()) + continue; + if (!L->contains(User)) + continue; + if (hasUsesOutsideLoop(User, L)) + continue; + + if (const SCEVConstant *Diff = dyn_cast<SCEVConstant>(SE->getMinusSCEV( + SE->getSCEV(User), SE->getSCEV(IV)))) { + uint64_t Idx = Diff->getValue()->getValue().getZExtValue(); + if (Idx > 0 && Idx < Scale) { + Roots[Idx-1].push_back(User); + AllRoots.insert(User); + } else if (Idx == Scale && Inc > 1) { + LoopIncs.push_back(User); + } + } + } + + if (Roots[0].empty()) + return false; + bool AllSame = true; + for (unsigned i = 1; i < Scale-1; ++i) + if (Roots[i].size() != Roots[0].size()) { + AllSame = false; + break; + } + + if (!AllSame) + return false; + + return true; +} + +// Validate the selected reductions. All iterations must have an isomorphic +// part of the reduction chain and, for non-associative reductions, the chain +// entries must appear in order. +bool LoopReroll::ReductionTracker::validateSelected() { + // For a non-associative reduction, the chain entries must appear in order. + for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); + RI != RIE; ++RI) { + int i = *RI; + int PrevIter = 0, BaseCount = 0, Count = 0; + for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(), + JE = PossibleReds[i].end(); J != JE; ++J) { + // Note that all instructions in the chain must have been found because + // all instructions in the function must have been assigned to some + // iteration. + int Iter = PossibleRedIter[*J]; + if (Iter != PrevIter && Iter != PrevIter + 1 && + !PossibleReds[i].getReducedValue()->isAssociative()) { + DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " << + *J << "\n"); + return false; + } + + if (Iter != PrevIter) { + if (Count != BaseCount) { + DEBUG(dbgs() << "LRR: Iteration " << PrevIter << + " reduction use count " << Count << + " is not equal to the base use count " << + BaseCount << "\n"); + return false; + } + + Count = 0; + } + + ++Count; + if (Iter == 0) + ++BaseCount; + + PrevIter = Iter; + } + } + + return true; +} + +// For all selected reductions, remove all parts except those in the first +// iteration (and the PHI). Replace outside uses of the reduced value with uses +// of the first-iteration reduced value (in other words, reroll the selected +// reductions). +void LoopReroll::ReductionTracker::replaceSelected() { + // Fixup reductions to refer to the last instruction associated with the + // first iteration (not the last). + for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); + RI != RIE; ++RI) { + int i = *RI; + int j = 0; + for (int e = PossibleReds[i].size(); j != e; ++j) + if (PossibleRedIter[PossibleReds[i][j]] != 0) { + --j; + break; + } + + // Replace users with the new end-of-chain value. 
+ SmallInstructionVector Users; + for (Value::use_iterator UI = + PossibleReds[i].getReducedValue()->use_begin(), + UIE = PossibleReds[i].getReducedValue()->use_end(); UI != UIE; ++UI) + Users.push_back(cast<Instruction>(*UI)); + + for (SmallInstructionVector::iterator J = Users.begin(), + JE = Users.end(); J != JE; ++J) + (*J)->replaceUsesOfWith(PossibleReds[i].getReducedValue(), + PossibleReds[i][j]); + } +} + +// Reroll the provided loop with respect to the provided induction variable. +// Generally, we're looking for a loop like this: +// +// %iv = phi [ (preheader, ...), (body, %iv.next) ] +// f(%iv) +// %iv.1 = add %iv, 1 <-- a root increment +// f(%iv.1) +// %iv.2 = add %iv, 2 <-- a root increment +// f(%iv.2) +// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment +// f(%iv.scale_m_1) +// ... +// %iv.next = add %iv, scale +// %cmp = icmp(%iv, ...) +// br %cmp, header, exit +// +// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of +// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can +// be intermixed with each other. The restriction imposed by this algorithm is +// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1), +// etc. be the same. +// +// First, we collect the use set of %iv, excluding the other increment roots. +// This gives us f(%iv). Then we iterate over the loop instructions (scale-1) +// times, having collected the use set of f(%iv.(i+1)), during which we: +// - Ensure that the next unmatched instruction in f(%iv) is isomorphic to +// the next unmatched instruction in f(%iv.(i+1)). +// - Ensure that both matched instructions don't have any external users +// (with the exception of last-in-chain reduction instructions). +// - Track the (aliasing) write set, and other side effects, of all +// instructions that belong to future iterations that come before the matched +// instructions. If the matched instructions read from that write set, then +// f(%iv) or f(%iv.(i+1)) has some dependency on instructions in +// f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly, +// if any of these future instructions had side effects (could not be +// speculatively executed), and so do the matched instructions, then we +// cannot reorder those side-effect-producing instructions, and rerolling +// fails. +// +// Finally, we make sure that all loop instructions either are loop increment +// roots, belong to simple latch code, are parts of validated reductions, or are +// part of f(%iv) or of some f(%iv.i). If all of that is true (and all reductions +// have been validated), then we reroll the loop. +bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, + const SCEV *IterCount, + ReductionTracker &Reductions) { + const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV)); + uint64_t Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))-> + getValue()->getZExtValue(); + // The collection of loop increment instructions. + SmallInstructionVector LoopIncs; + uint64_t Scale = Inc; + + // The effective induction variable, IV, is normally also the real induction + // variable. When we're dealing with a loop like: + // for (int i = 0; i < 500; ++i) + // x[3*i] = ...; + // x[3*i+1] = ...; + // x[3*i+2] = ...; + // then the real IV is still i, but the effective IV is (3*i).
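A worked sketch (hypothetical source) connecting a concrete loop to the f(%iv)/f(%iv.k) notation above for a scale-3 body:

    void stores(int *x, int a, int n) {
      for (int i = 0; i < n; i += 3) {
        x[i]     = a;  // f(%iv):   BaseUseSet holds the GEP and store on i
        x[i + 1] = a;  // f(%iv.1): RootUseSets[0], rooted at %iv.1 = add %iv, 1
        x[i + 2] = a;  // f(%iv.2): RootUseSets[1], rooted at %iv.2 = add %iv, 2
      }
    }
    // The three per-iteration DAGs are isomorphic and identically ordered, so
    // matching succeeds and the loop rerolls to: for (; i < n; ++i) x[i] = a;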
+ Instruction *RealIV = IV; + if (Inc == 1 && !findScaleFromMul(RealIV, Scale, IV, LoopIncs)) + return false; + + assert(Scale <= MaxInc && "Scale is too large"); + assert(Scale > 1 && "Scale must be at least 2"); + + // The set of increment instructions for each increment value. + SmallVector<SmallInstructionVector, 32> Roots(Scale-1); + SmallInstructionSet AllRoots; + if (!collectAllRoots(L, Inc, Scale, IV, Roots, AllRoots, LoopIncs)) + return false; + + DEBUG(dbgs() << "LRR: Found all root induction increments for: " << + *RealIV << "\n"); + + // An array of just the possible reductions for this scale factor. When we + // collect the set of all users of some root instructions, these reduction + // instructions are treated as 'final' (their uses are not considered). + // This is important because we don't want the root use set to search down + // the reduction chain. + SmallInstructionSet PossibleRedSet; + SmallInstructionSet PossibleRedLastSet, PossibleRedPHISet; + Reductions.restrictToScale(Scale, PossibleRedSet, PossibleRedPHISet, + PossibleRedLastSet); + + // We now need to check for equivalence of the use graph of each root with + // that of the primary induction variable (excluding the roots). Our goal + // here is not to solve the full graph isomorphism problem, but rather to + // catch common cases without a lot of work. As a result, we will assume + // that the relative order of the instructions in each unrolled iteration + // is the same (although we will not make an assumption about how the + // different iterations are intermixed). Note that while the order must be + // the same, the instructions may not be in the same basic block. + SmallInstructionSet Exclude(AllRoots); + Exclude.insert(LoopIncs.begin(), LoopIncs.end()); + + DenseSet<Instruction *> BaseUseSet; + collectInLoopUserSet(L, IV, Exclude, PossibleRedSet, BaseUseSet); + + DenseSet<Instruction *> AllRootUses; + std::vector<DenseSet<Instruction *> > RootUseSets(Scale-1); + + bool MatchFailed = false; + for (unsigned i = 0; i < Scale-1 && !MatchFailed; ++i) { + DenseSet<Instruction *> &RootUseSet = RootUseSets[i]; + collectInLoopUserSet(L, Roots[i], SmallInstructionSet(), + PossibleRedSet, RootUseSet); + + DEBUG(dbgs() << "LRR: base use set size: " << BaseUseSet.size() << + " vs. iteration increment " << (i+1) << + " use set size: " << RootUseSet.size() << "\n"); + + if (BaseUseSet.size() != RootUseSet.size()) { + MatchFailed = true; + break; + } + + // In addition to regular aliasing information, we need to look for + // instructions from later (future) iterations that have side effects + // preventing us from reordering them past other instructions with side + // effects. + bool FutureSideEffects = false; + AliasSetTracker AST(*AA); + + // The map between instructions in f(%iv.(i+1)) and f(%iv). + DenseMap<Value *, Value *> BaseMap; + + assert(L->getNumBlocks() == 1 && "Cannot handle multi-block loops"); + for (BasicBlock::iterator J1 = Header->begin(), J2 = Header->begin(), + JE = Header->end(); J1 != JE && !MatchFailed; ++J1) { + if (cast<Instruction>(J1) == RealIV) + continue; + if (cast<Instruction>(J1) == IV) + continue; + if (!BaseUseSet.count(J1)) + continue; + if (PossibleRedPHISet.count(J1)) // Skip reduction PHIs. + continue; + + while (J2 != JE && (!RootUseSet.count(J2) || + std::find(Roots[i].begin(), Roots[i].end(), J2) != + Roots[i].end())) { + // As we iterate through the instructions, instructions that don't + // belong to previous iterations (or the base case), must belong to + // future iterations. 
We want to track the alias set of writes from + // previous iterations. + if (!isa<PHINode>(J2) && !BaseUseSet.count(J2) && + !AllRootUses.count(J2)) { + if (J2->mayWriteToMemory()) + AST.add(J2); + + // Note: This is specifically guarded by a check on isa<PHINode>, + // which, while a valid (somewhat arbitrary) micro-optimization, is + // needed because otherwise isSafeToSpeculativelyExecute returns + // false on PHI nodes. + if (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2, DL)) + FutureSideEffects = true; + } + + ++J2; + } + + if (!J1->isSameOperationAs(J2)) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << + " vs. " << *J2 << "\n"); + MatchFailed = true; + break; + } + + // Make sure that this instruction, which is in the use set of this + // root instruction, does not also belong to the base set or the set of + // some previous root instruction. + if (BaseUseSet.count(J2) || AllRootUses.count(J2)) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << + " vs. " << *J2 << " (prev. case overlap)\n"); + MatchFailed = true; + break; + } + + // Make sure that we don't alias with any instruction in the alias set + // tracker. If we do, then we depend on a future iteration, and we + // can't reroll. + if (J2->mayReadFromMemory()) { + for (AliasSetTracker::iterator K = AST.begin(), KE = AST.end(); + K != KE && !MatchFailed; ++K) { + if (K->aliasesUnknownInst(J2, *AA)) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << + " vs. " << *J2 << " (depends on future store)\n"); + MatchFailed = true; + break; + } + } + } + + // If we've passed an instruction from a future iteration that may have + // side effects, and this instruction might also, then we can't reorder + // them, and this matching fails. As an exception, we allow the alias + // set tracker to handle regular (simple) load/store dependencies. + if (FutureSideEffects && + ((!isSimpleLoadStore(J1) && !isSafeToSpeculativelyExecute(J1)) || + (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2)))) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << + " vs. " << *J2 << + " (side effects prevent reordering)\n"); + MatchFailed = true; + break; + } + + // For instructions that are part of a reduction, if the operation is + // associative, then don't bother matching the operands (because we + // already know that the instructions are isomorphic, and the order + // within the iteration does not matter). For non-associative reductions, + // we do need to match the operands, because we need to reject + // out-of-order instructions within an iteration! + // For example (assume floating-point addition), we need to reject this: + // x += a[i]; x += b[i]; + // x += a[i+1]; x += b[i+1]; + // x += b[i+2]; x += a[i+2]; + bool InReduction = Reductions.isPairInSame(J1, J2); + + if (!(InReduction && J1->isAssociative())) { + bool Swapped = false, SomeOpMatched = false; + for (unsigned j = 0; j < J1->getNumOperands() && !MatchFailed; ++j) { + Value *Op2 = J2->getOperand(j); + + // If this is part of a reduction (and the operation is not + // associative), then we match all operands, but not those that are + // part of the reduction.
+ if (InReduction) + if (Instruction *Op2I = dyn_cast<Instruction>(Op2)) + if (Reductions.isPairInSame(J2, Op2I)) + continue; + + DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2); + if (BMI != BaseMap.end()) + Op2 = BMI->second; + else if (std::find(Roots[i].begin(), Roots[i].end(), + (Instruction*) Op2) != Roots[i].end()) + Op2 = IV; + + if (J1->getOperand(Swapped ? unsigned(!j) : j) != Op2) { + // If we've not already decided to swap the matched operands, and + // we've not already matched our first operand (note that we could + // have skipped matching the first operand because it is part of a + // reduction above), and the instruction is commutative, then try + // the swapped match. + if (!Swapped && J1->isCommutative() && !SomeOpMatched && + J1->getOperand(!j) == Op2) { + Swapped = true; + } else { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << + " vs. " << *J2 << " (operand " << j << ")\n"); + MatchFailed = true; + break; + } + } + + SomeOpMatched = true; + } + } + + if ((!PossibleRedLastSet.count(J1) && hasUsesOutsideLoop(J1, L)) || + (!PossibleRedLastSet.count(J2) && hasUsesOutsideLoop(J2, L))) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << + " vs. " << *J2 << " (uses outside loop)\n"); + MatchFailed = true; + break; + } + + if (!MatchFailed) + BaseMap.insert(std::pair<Value *, Value *>(J2, J1)); + + AllRootUses.insert(J2); + Reductions.recordPair(J1, J2, i+1); + + ++J2; + } + } + + if (MatchFailed) + return false; + + DEBUG(dbgs() << "LRR: Matched all iteration increments for " << + *RealIV << "\n"); + + DenseSet<Instruction *> LoopIncUseSet; + collectInLoopUserSet(L, LoopIncs, SmallInstructionSet(), + SmallInstructionSet(), LoopIncUseSet); + DEBUG(dbgs() << "LRR: Loop increment set size: " << + LoopIncUseSet.size() << "\n"); + + // Make sure that all instructions in the loop have been included in some + // use set. + for (BasicBlock::iterator J = Header->begin(), JE = Header->end(); + J != JE; ++J) { + if (isa<DbgInfoIntrinsic>(J)) + continue; + if (cast<Instruction>(J) == RealIV) + continue; + if (cast<Instruction>(J) == IV) + continue; + if (BaseUseSet.count(J) || AllRootUses.count(J) || + (LoopIncUseSet.count(J) && (J->isTerminator() || + isSafeToSpeculativelyExecute(J, DL)))) + continue; + + if (AllRoots.count(J)) + continue; + + if (Reductions.isSelectedPHI(J)) + continue; + + DEBUG(dbgs() << "LRR: aborting reroll based on " << *RealIV << + " unprocessed instruction found: " << *J << "\n"); + MatchFailed = true; + break; + } + + if (MatchFailed) + return false; + + DEBUG(dbgs() << "LRR: all instructions processed from " << + *RealIV << "\n"); + + if (!Reductions.validateSelected()) + return false; + + // At this point, we've validated the rerolling, and we're committed to + // making changes! + + Reductions.replaceSelected(); + + // Remove instructions associated with non-base iterations. + for (BasicBlock::reverse_iterator J = Header->rbegin(); + J != Header->rend();) { + if (AllRootUses.count(&*J)) { + Instruction *D = &*J; + DEBUG(dbgs() << "LRR: removing: " << *D << "\n"); + D->eraseFromParent(); + continue; + } + + ++J; + } + + // Insert the new induction variable. + const SCEV *Start = RealIVSCEV->getStart(); + if (Inc == 1) + Start = SE->getMulExpr(Start, + SE->getConstant(Start->getType(), Scale)); + const SCEVAddRecExpr *H = + cast<SCEVAddRecExpr>(SE->getAddRecExpr(Start, + SE->getConstant(RealIVSCEV->getType(), 1), + L, SCEV::FlagAnyWrap)); + { // Limit the lifetime of SCEVExpander. 
+ SCEVExpander Expander(*SE, "reroll"); + PHINode *NewIV = + cast<PHINode>(Expander.expandCodeFor(H, IV->getType(), + Header->begin())); + for (DenseSet<Instruction *>::iterator J = BaseUseSet.begin(), + JE = BaseUseSet.end(); J != JE; ++J) + (*J)->replaceUsesOfWith(IV, NewIV); + + if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) { + if (LoopIncUseSet.count(BI)) { + const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE); + if (Inc == 1) + ICSCEV = + SE->getMulExpr(ICSCEV, SE->getConstant(ICSCEV->getType(), Scale)); + Value *IC; + if (isa<SCEVConstant>(ICSCEV)) { + IC = Expander.expandCodeFor(ICSCEV, NewIV->getType(), BI); + } else { + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) + Preheader = InsertPreheaderForLoop(L, this); + + IC = Expander.expandCodeFor(ICSCEV, NewIV->getType(), + Preheader->getTerminator()); + } + + Value *NewIVNext = NewIV->getIncomingValueForBlock(Header); + Value *Cond = new ICmpInst(BI, CmpInst::ICMP_EQ, NewIVNext, IC, + "exitcond"); + BI->setCondition(Cond); + + if (BI->getSuccessor(1) != Header) + BI->swapSuccessors(); + } + } + } + + SimplifyInstructionsInBlock(Header, DL, TLI); + DeleteDeadPHIs(Header, TLI); + ++NumRerolledLoops; + return true; +} + +bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { + AA = &getAnalysis<AliasAnalysis>(); + LI = &getAnalysis<LoopInfo>(); + SE = &getAnalysis<ScalarEvolution>(); + TLI = &getAnalysis<TargetLibraryInfo>(); + DL = getAnalysisIfAvailable<DataLayout>(); + DT = &getAnalysis<DominatorTree>(); + + BasicBlock *Header = L->getHeader(); + DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << + "] Loop %" << Header->getName() << " (" << + L->getNumBlocks() << " block(s))\n"); + + bool Changed = false; + + // For now, we'll handle only single BB loops. + if (L->getNumBlocks() > 1) + return Changed; + + if (!SE->hasLoopInvariantBackedgeTakenCount(L)) + return Changed; + + const SCEV *LIBETC = SE->getBackedgeTakenCount(L); + const SCEV *IterCount = + SE->getAddExpr(LIBETC, SE->getConstant(LIBETC->getType(), 1)); + DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n"); + + // First, we need to find the induction variable with respect to which we can + // reroll (there may be several possible options). + SmallInstructionVector PossibleIVs; + collectPossibleIVs(L, PossibleIVs); + + if (PossibleIVs.empty()) { + DEBUG(dbgs() << "LRR: No possible IVs found\n"); + return Changed; + } + + ReductionTracker Reductions; + collectPossibleReductions(L, Reductions); + + // For each possible IV, collect the associated possible set of 'root' nodes + // (i+1, i+2, etc.). + for (SmallInstructionVector::iterator I = PossibleIVs.begin(), + IE = PossibleIVs.end(); I != IE; ++I) + if (reroll(*I, L, Header, IterCount, Reductions)) { + Changed = true; + break; + } + + return Changed; +} + diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 73e44d7..eff5268 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -774,6 +774,16 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) { } namespace { +class LSRUse; +} +// Check if it is legal to fold 2 base registers. +static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU, + const Formula &F); +// Get the cost of the scaling factor used in F for LU. 
+static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, + const LSRUse &LU, const Formula &F); + +namespace { /// Cost - This class is used to measure and compare candidate formulae. class Cost { @@ -785,11 +795,12 @@ class Cost { unsigned NumBaseAdds; unsigned ImmCost; unsigned SetupCost; + unsigned ScaleCost; public: Cost() : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0), - SetupCost(0) {} + SetupCost(0), ScaleCost(0) {} bool operator<(const Cost &Other) const; @@ -799,9 +810,9 @@ public: // Once any of the metrics loses, they must all remain losers. bool isValid() { return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds - | ImmCost | SetupCost) != ~0u) + | ImmCost | SetupCost | ScaleCost) != ~0u) || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds - & ImmCost & SetupCost) == ~0u); + & ImmCost & SetupCost & ScaleCost) == ~0u); } #endif @@ -810,12 +821,14 @@ public: return NumRegs == ~0u; } - void RateFormula(const Formula &F, + void RateFormula(const TargetTransformInfo &TTI, + const Formula &F, SmallPtrSet<const SCEV *, 16> &Regs, const DenseSet<const SCEV *> &VisitedRegs, const Loop *L, const SmallVectorImpl<int64_t> &Offsets, ScalarEvolution &SE, DominatorTree &DT, + const LSRUse &LU, SmallPtrSet<const SCEV *, 16> *LoserRegs = 0); void print(raw_ostream &OS) const; @@ -900,12 +913,14 @@ void Cost::RatePrimaryRegister(const SCEV *Reg, } } -void Cost::RateFormula(const Formula &F, +void Cost::RateFormula(const TargetTransformInfo &TTI, + const Formula &F, SmallPtrSet<const SCEV *, 16> &Regs, const DenseSet<const SCEV *> &VisitedRegs, const Loop *L, const SmallVectorImpl<int64_t> &Offsets, ScalarEvolution &SE, DominatorTree &DT, + const LSRUse &LU, SmallPtrSet<const SCEV *, 16> *LoserRegs) { // Tally up the registers. if (const SCEV *ScaledReg = F.ScaledReg) { @@ -932,7 +947,12 @@ void Cost::RateFormula(const Formula &F, // Determine how many (unfolded) adds we'll need inside the loop. size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0); if (NumBaseParts > 1) - NumBaseAdds += NumBaseParts - 1; + // Do not count the base and a possible second register if the target + // allows to fold 2 registers. + NumBaseAdds += NumBaseParts - (1 + isLegal2RegAMUse(TTI, LU, F)); + + // Accumulate non-free scaling amounts. + ScaleCost += getScalingFactorCost(TTI, LU, F); // Tally up the non-zero immediates. for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(), @@ -955,6 +975,7 @@ void Cost::Loose() { NumBaseAdds = ~0u; ImmCost = ~0u; SetupCost = ~0u; + ScaleCost = ~0u; } /// operator< - Choose the lower cost. @@ -967,6 +988,8 @@ bool Cost::operator<(const Cost &Other) const { return NumIVMuls < Other.NumIVMuls; if (NumBaseAdds != Other.NumBaseAdds) return NumBaseAdds < Other.NumBaseAdds; + if (ScaleCost != Other.ScaleCost) + return ScaleCost < Other.ScaleCost; if (ImmCost != Other.ImmCost) return ImmCost < Other.ImmCost; if (SetupCost != Other.SetupCost) @@ -983,6 +1006,8 @@ void Cost::print(raw_ostream &OS) const { if (NumBaseAdds != 0) OS << ", plus " << NumBaseAdds << " base add" << (NumBaseAdds == 1 ? "" : "s"); + if (ScaleCost != 0) + OS << ", plus " << ScaleCost << " scale cost"; if (ImmCost != 0) OS << ", plus " << ImmCost << " imm cost"; if (SetupCost != 0) @@ -1145,6 +1170,13 @@ public: /// may be used. bool AllFixupsOutsideLoop; + /// RigidFormula is set to true to guarantee that this use will be associated + /// with a single formula--the one that initially matched. Some SCEV + /// expressions cannot be expanded. 
This allows LSR to consider the registers + /// used by those expressions without the need to expand them later after + /// changing the formula. + bool RigidFormula; + /// WidestFixupType - This records the widest use type for any fixup using /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different /// max fixup widths to be equivalent, because the narrower one may be relying @@ -1163,6 +1195,7 @@ public: MinOffset(INT64_MAX), MaxOffset(INT64_MIN), AllFixupsOutsideLoop(true), + RigidFormula(false), WidestFixupType(0) {} bool HasFormulaWithSameRegs(const Formula &F) const; @@ -1189,6 +1222,9 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { /// InsertFormula - If the given formula has not yet been inserted, add it to /// the list, and return true. Return false otherwise. bool LSRUse::InsertFormula(const Formula &F) { + if (!Formulae.empty() && RigidFormula) + return false; + SmallVector<const SCEV *, 4> Key = F.BaseRegs; if (F.ScaledReg) Key.push_back(F.ScaledReg); // Unstable sort by host order ok, because this is only used for uniquifying. @@ -1359,6 +1395,66 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, F.BaseOffset, F.HasBaseReg, F.Scale); } +static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU, + const Formula &F) { + // If F is used as an Addressing Mode, it may fold one Base plus one + // scaled register. If the scaled register is nil, do as if another + // element of the base regs is a 1-scaled register. + // This is possible if BaseRegs has at least 2 registers. + + // If this is not an address calculation, this is not an addressing mode + // use. + if (LU.Kind != LSRUse::Address) + return false; + + // F is already scaled. + if (F.Scale != 0) + return false; + + // We need to keep one register for the base and one to scale. + if (F.BaseRegs.size() < 2) + return false; + + return isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, + F.BaseGV, F.BaseOffset, F.HasBaseReg, 1); + } + +static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, + const LSRUse &LU, const Formula &F) { + if (!F.Scale) + return 0; + assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, F) && "Illegal formula in use."); + + switch (LU.Kind) { + case LSRUse::Address: { + // Check the scaling factor cost with both the min and max offsets. + int ScaleCostMinOffset = + TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV, + F.BaseOffset + LU.MinOffset, + F.HasBaseReg, F.Scale); + int ScaleCostMaxOffset = + TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV, + F.BaseOffset + LU.MaxOffset, + F.HasBaseReg, F.Scale); + + assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 && + "Legal addressing mode has an illegal cost!"); + return std::max(ScaleCostMinOffset, ScaleCostMaxOffset); + } + case LSRUse::ICmpZero: + // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg. + // Therefore, return 0 in case F.Scale == -1. 
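+    // (Illustrative: with Scale == -1 the test "BaseReg + -1*ScaleReg == 0"
+    // is simply "icmp eq BaseReg, ScaleReg", so no multiply is materialized
+    // and the scale is free; any other non-zero scale costs one unit.)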
+ return F.Scale != -1; + + case LSRUse::Basic: + case LSRUse::Special: + return 0; + } + + llvm_unreachable("Invalid LSRUse Kind!"); +} + static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, Type *AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, @@ -1664,7 +1760,7 @@ void LSRInstance::OptimizeShadowIV() { IVUsers::const_iterator CandidateUI = UI; ++UI; Instruction *ShadowUse = CandidateUI->getUser(); - Type *DestTy = NULL; + Type *DestTy = 0; bool IsSigned = false; /* If shadow use is a int->float cast then insert a second IV @@ -1726,7 +1822,7 @@ void LSRInstance::OptimizeShadowIV() { continue; /* Initialize new IV, double d = 0.0 in above example. */ - ConstantInt *C = NULL; + ConstantInt *C = 0; if (Incr->getOperand(0) == PH) C = dyn_cast<ConstantInt>(Incr->getOperand(1)); else if (Incr->getOperand(1) == PH) @@ -2858,7 +2954,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { // x == y --> x - y == 0 const SCEV *N = SE.getSCEV(NV); - if (SE.isLoopInvariant(N, L) && isSafeToExpand(N)) { + if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) { // S is normalized, so normalize N before folding it into S // to keep the result normalized. N = TransformForPostIncUse(Normalize, N, CI, 0, @@ -2901,6 +2997,10 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { /// and loop-computable portions. void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { + // Mark uses whose expressions cannot be expanded. + if (!isSafeToExpand(S, SE)) + LU.RigidFormula = true; + Formula F; F.InitialMatch(S, L, SE); bool Inserted = InsertFormula(LU, LUIdx, F); @@ -3048,7 +3148,7 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, if (Remainder) Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); } - return NULL; + return 0; } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { // Split a non-zero base out of an addrec. if (AR->getStart()->isZero()) @@ -3060,7 +3160,7 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, // does not pertain to this loop. if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) { Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); - Remainder = NULL; + Remainder = 0; } if (Remainder != AR->getStart()) { if (!Remainder) @@ -3082,7 +3182,7 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1); if (Remainder) Ops.push_back(SE.getMulExpr(C, Remainder)); - return NULL; + return 0; } } return S; @@ -3607,7 +3707,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { abs64(NewF.BaseOffset)) && (C->getValue()->getValue() + NewF.BaseOffset).countTrailingZeros() >= - CountTrailingZeros_64(NewF.BaseOffset)) + countTrailingZeros<uint64_t>(NewF.BaseOffset)) goto skip_formula; // Ok, looks good. @@ -3690,7 +3790,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { // the corresponding bad register from the Regs set. 
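      // (Illustrative reminder: Cost::operator< compares the metrics
      // lexicographically -- NumRegs, AddRecCost, NumIVMuls, NumBaseAdds,
      // then the new ScaleCost, ImmCost, and SetupCost -- so ScaleCost only
      // breaks ties once the register and add counts are equal.)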
Cost CostF; Regs.clear(); - CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, + CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU, &LoserRegs); if (CostF.isLoser()) { // During initial formula generation, undesirable formulae are generated @@ -3726,7 +3826,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { Cost CostBest; Regs.clear(); - CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT); + CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE, + DT, LU); if (CostF < CostBest) std::swap(F, Best); DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs()); @@ -4079,7 +4180,8 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, // the current best, prune the search at that point. NewCost = CurCost; NewRegs = CurRegs; - NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT); + NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT, + LU); if (NewCost < SolutionCost) { Workspace.push_back(&F); if (Workspace.size() != Uses.size()) { @@ -4266,6 +4368,8 @@ Value *LSRInstance::Expand(const LSRFixup &LF, SCEVExpander &Rewriter, SmallVectorImpl<WeakVH> &DeadInsts) const { const LSRUse &LU = Uses[LF.LUIdx]; + if (LU.RigidFormula) + return LF.OperandValToReplace; // Determine an input position which will be dominated by the operands and // which will dominate the result. diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 80d060b..08ac38d 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -49,12 +49,17 @@ namespace { class LoopUnroll : public LoopPass { public: static char ID; // Pass ID, replacement for typeid - LoopUnroll(int T = -1, int C = -1, int P = -1) : LoopPass(ID) { + LoopUnroll(int T = -1, int C = -1, int P = -1, int R = -1) : LoopPass(ID) { CurrentThreshold = (T == -1) ? UnrollThreshold : unsigned(T); CurrentCount = (C == -1) ? UnrollCount : unsigned(C); CurrentAllowPartial = (P == -1) ? UnrollAllowPartial : (bool)P; + CurrentRuntime = (R == -1) ? UnrollRuntime : (bool)R; UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0); + UserAllowPartial = (P != -1) || + (UnrollAllowPartial.getNumOccurrences() > 0); + UserRuntime = (R != -1) || (UnrollRuntime.getNumOccurrences() > 0); + UserCount = (C != -1) || (UnrollCount.getNumOccurrences() > 0); initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); } @@ -75,7 +80,11 @@ namespace { unsigned CurrentCount; unsigned CurrentThreshold; bool CurrentAllowPartial; + bool CurrentRuntime; + bool UserCount; // CurrentCount is user-specified. bool UserThreshold; // CurrentThreshold is user-specified. + bool UserAllowPartial; // CurrentAllowPartial is user-specified. + bool UserRuntime; // CurrentRuntime is user-specified. bool runOnLoop(Loop *L, LPPassManager &LPM); @@ -110,8 +119,9 @@ INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) -Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial) { - return new LoopUnroll(Threshold, Count, AllowPartial); +Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial, + int Runtime) { + return new LoopUnroll(Threshold, Count, AllowPartial, Runtime); } /// ApproximateLoopSize - Approximate the size of the loop. 
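// Illustrative call site for the new four-parameter factory (hypothetical
// client code, not taken from this patch):
//   PM.add(createLoopUnrollPass(/*Threshold=*/150, /*Count=*/-1,
//                               /*AllowPartial=*/-1, /*Runtime=*/1));
// Passing -1 for any parameter keeps the corresponding cl::opt default.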
@@ -145,16 +155,24 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { << "] Loop %" << Header->getName() << "\n"); (void)Header; + TargetTransformInfo::UnrollingPreferences UP; + UP.Threshold = CurrentThreshold; + UP.OptSizeThreshold = OptSizeUnrollThreshold; + UP.Count = CurrentCount; + UP.Partial = CurrentAllowPartial; + UP.Runtime = CurrentRuntime; + TTI.getUnrollingPreferences(L, UP); + // Determine the current unrolling threshold. While this is normally set // from UnrollThreshold, it is overridden to a smaller value if the current // function is marked as optimize-for-size, and the unroll threshold was // not user specified. - unsigned Threshold = CurrentThreshold; + unsigned Threshold = UserThreshold ? CurrentThreshold : UP.Threshold; if (!UserThreshold && Header->getParent()->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize)) - Threshold = OptSizeUnrollThreshold; + Threshold = UP.OptSizeThreshold; // Find trip count and trip multiple if count is not available unsigned TripCount = 0; @@ -167,11 +185,14 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { TripCount = SE->getSmallConstantTripCount(L, LatchBlock); TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock); } + + bool Runtime = UserRuntime ? CurrentRuntime : UP.Runtime; + // Use a default unroll-count if the user doesn't specify a value // and the trip count is a run-time value. The default is different // for run-time or compile-time trip count loops. - unsigned Count = CurrentCount; - if (UnrollRuntime && CurrentCount == 0 && TripCount == 0) + unsigned Count = UserCount ? CurrentCount : UP.Count; + if (Runtime && Count == 0 && TripCount == 0) Count = UnrollRuntimeCount; if (Count == 0) { @@ -204,7 +225,8 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { if (TripCount != 1 && Size > Threshold) { DEBUG(dbgs() << " Too large to fully unroll with count: " << Count << " because size: " << Size << ">" << Threshold << "\n"); - if (!CurrentAllowPartial && !(UnrollRuntime && TripCount == 0)) { + bool AllowPartial = UserAllowPartial ? CurrentAllowPartial : UP.Partial; + if (!AllowPartial && !(Runtime && TripCount == 0)) { DEBUG(dbgs() << " will not try to unroll partially because " << "-unroll-allow-partial not given\n"); return false; @@ -215,7 +237,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { while (Count != 0 && TripCount%Count != 0) Count--; } - else if (UnrollRuntime) { + else if (Runtime) { // Reduce unroll count to be a lower power-of-two value while (Count != 0 && Size > Threshold) { Count >>= 1; @@ -231,7 +253,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { } // Unroll the loop. - if (!UnrollLoop(L, Count, TripCount, UnrollRuntime, TripMultiple, LI, &LPM)) + if (!UnrollLoop(L, Count, TripCount, Runtime, TripMultiple, LI, &LPM)) return false; return true; diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp index 0e8199f..c4ebfd5 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -87,8 +87,8 @@ namespace { typedef LoopPropsMap::iterator LoopPropsMapIt; LoopPropsMap LoopsProperties; - UnswitchedValsMap* CurLoopInstructions; - LoopProperties* CurrentLoopProperties; + UnswitchedValsMap *CurLoopInstructions; + LoopProperties *CurrentLoopProperties; // Max size of code we can produce on remained iterations. 
 unsigned MaxSize;

@@ -96,30 +96,30 @@ public:

   LUAnalysisCache() :
-    CurLoopInstructions(NULL), CurrentLoopProperties(NULL),
+    CurLoopInstructions(0), CurrentLoopProperties(0),
     MaxSize(Threshold) {}

   // Analyze the loop. Check its size and determine whether it is possible to unswitch
   // it. Returns true if we can unswitch this loop.
-  bool countLoop(const Loop* L, const TargetTransformInfo &TTI);
+  bool countLoop(const Loop *L, const TargetTransformInfo &TTI);

   // Clean all data related to the given loop.
-  void forgetLoop(const Loop* L);
+  void forgetLoop(const Loop *L);

   // Mark case value as unswitched.
   // Since a switch instruction can be partly unswitched, in order to avoid
   // extra unswitching in cloned loops, keep track of all unswitched values.
-  void setUnswitched(const SwitchInst* SI, const Value* V);
+  void setUnswitched(const SwitchInst *SI, const Value *V);

   // Check whether this case value was unswitched before or not.
-  bool isUnswitched(const SwitchInst* SI, const Value* V);
+  bool isUnswitched(const SwitchInst *SI, const Value *V);

   // Clone all loop-unswitch related loop properties.
   // Redistribute unswitching quotas.
   // Note that the new loop data is stored inside the VMap.
-  void cloneData(const Loop* NewLoop, const Loop* OldLoop,
-                 const ValueToValueMapTy& VMap);
+  void cloneData(const Loop *NewLoop, const Loop *OldLoop,
+                 const ValueToValueMapTy &VMap);
 };

 class LoopUnswitch : public LoopPass {
@@ -151,8 +151,8 @@ namespace {
     static char ID; // Pass ID, replacement for typeid
     explicit LoopUnswitch(bool Os = false) :
       LoopPass(ID), OptimizeForSize(Os), redoLoop(false),
-      currentLoop(NULL), DT(NULL), loopHeader(NULL),
-      loopPreheader(NULL) {
+      currentLoop(0), DT(0), loopHeader(0),
+      loopPreheader(0) {
         initializeLoopUnswitchPass(*PassRegistry::getPassRegistry());
       }

@@ -196,7 +196,7 @@ namespace {

     /// Split all of the edges from inside the loop to their exit blocks.
     /// Update the appropriate Phi nodes as we do so.
-    void SplitExitEdges(Loop *L, const SmallVector<BasicBlock *, 8> &ExitBlocks);
+    void SplitExitEdges(Loop *L, const SmallVectorImpl<BasicBlock *> &ExitBlocks);

     bool UnswitchIfProfitable(Value *LoopCond, Constant *Val);
     void UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
@@ -212,8 +212,6 @@ namespace {
                                   Instruction *InsertPt);

     void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L);
-    void RemoveBlockIfDead(BasicBlock *BB,
-                           std::vector<Instruction*> &Worklist, Loop *l);
     void RemoveLoopFromHierarchy(Loop *L);
     bool IsTrivialUnswitchCondition(Value *Cond, Constant **Val = 0,
                                     BasicBlock **LoopExit = 0);
@@ -225,12 +223,14 @@
 // it. Returns true if we can unswitch this loop.
 bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) {

-  std::pair<LoopPropsMapIt, bool> InsertRes =
+  LoopPropsMapIt PropsIt;
+  bool Inserted;
+  llvm::tie(PropsIt, Inserted) =
     LoopsProperties.insert(std::make_pair(L, LoopProperties()));

-  LoopProperties& Props = InsertRes.first->second;
+  LoopProperties &Props = PropsIt->second;

-  if (InsertRes.second) {
+  if (Inserted) {
     // New loop.

     // Limit the number of instructions to avoid causing significant code
@@ -242,8 +242,7 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) {
     // consideration code simplification opportunities and code that can
     // be shared by the resultant unswitched loops.
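    // (Illustrative: unswitching rewrites
    //     for (...) { if (C) A(); else B(); }     // C loop-invariant
    // into
    //     if (C) { for (...) A(); } else { for (...) B(); }
    // roughly doubling the loop body, which is the growth this quota bounds.)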
     CodeMetrics Metrics;
-    for (Loop::block_iterator I = L->block_begin(),
-           E = L->block_end();
+    for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
          I != E; ++I)
       Metrics.analyzeBasicBlock(*I, TTI);

@@ -253,17 +252,16 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) {

   if (Metrics.notDuplicatable) {
     DEBUG(dbgs() << "NOT unswitching loop %"
-          << L->getHeader()->getName() << ", contents cannot be "
-          << "duplicated!\n");
+                 << L->getHeader()->getName() << ", contents cannot be "
+                 << "duplicated!\n");
     return false;
   }
 }

 if (!Props.CanBeUnswitchedCount) {
   DEBUG(dbgs() << "NOT unswitching loop %"
-        << L->getHeader()->getName() << ", cost too high: "
-        << L->getBlocks().size() << "\n");
-
+               << L->getHeader()->getName() << ", cost too high: "
+               << L->getBlocks().size() << "\n");
   return false;
 }

@@ -275,41 +273,41 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) {
 }

 // Clean all data related to the given loop.
-void LUAnalysisCache::forgetLoop(const Loop* L) {
+void LUAnalysisCache::forgetLoop(const Loop *L) {

   LoopPropsMapIt LIt = LoopsProperties.find(L);
   if (LIt != LoopsProperties.end()) {
-    LoopProperties& Props = LIt->second;
+    LoopProperties &Props = LIt->second;
     MaxSize += Props.CanBeUnswitchedCount * Props.SizeEstimation;
     LoopsProperties.erase(LIt);
   }

-  CurrentLoopProperties = NULL;
-  CurLoopInstructions = NULL;
+  CurrentLoopProperties = 0;
+  CurLoopInstructions = 0;
 }

 // Mark case value as unswitched.
 // Since a switch instruction can be partly unswitched, in order to avoid
 // extra unswitching in cloned loops, keep track of all unswitched values.
-void LUAnalysisCache::setUnswitched(const SwitchInst* SI, const Value* V) {
+void LUAnalysisCache::setUnswitched(const SwitchInst *SI, const Value *V) {
   (*CurLoopInstructions)[SI].insert(V);
 }

 // Check whether this case value was unswitched before or not.
-bool LUAnalysisCache::isUnswitched(const SwitchInst* SI, const Value* V) {
+bool LUAnalysisCache::isUnswitched(const SwitchInst *SI, const Value *V) {
   return (*CurLoopInstructions)[SI].count(V);
 }

 // Clone all loop-unswitch related loop properties.
 // Redistribute unswitching quotas.
 // Note that the new loop data is stored inside the VMap.
-void LUAnalysisCache::cloneData(const Loop* NewLoop, const Loop* OldLoop,
-                                const ValueToValueMapTy& VMap) {
+void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop,
+                                const ValueToValueMapTy &VMap) {

-  LoopProperties& NewLoopProps = LoopsProperties[NewLoop];
-  LoopProperties& OldLoopProps = *CurrentLoopProperties;
-  UnswitchedValsMap& Insts = OldLoopProps.UnswitchedVals;
+  LoopProperties &NewLoopProps = LoopsProperties[NewLoop];
+  LoopProperties &OldLoopProps = *CurrentLoopProperties;
+  UnswitchedValsMap &Insts = OldLoopProps.UnswitchedVals;

   // Reallocate "can-be-unswitched quota"

@@ -324,9 +322,9 @@ void LUAnalysisCache::cloneData(const Loop* NewLoop, const Loop* OldLoop,
   // for new loop switches we clone info about values that were
   // already unswitched and have redundant successors.
for (UnswitchedValsIt I = Insts.begin(); I != Insts.end(); ++I) { - const SwitchInst* OldInst = I->first; - Value* NewI = VMap.lookup(OldInst); - const SwitchInst* NewInst = cast_or_null<SwitchInst>(NewI); + const SwitchInst *OldInst = I->first; + Value *NewI = VMap.lookup(OldInst); + const SwitchInst *NewInst = cast_or_null<SwitchInst>(NewI); assert(NewInst && "All instructions that are in SrcBB must be in VMap."); NewLoopProps.UnswitchedVals[NewInst] = OldLoopProps.UnswitchedVals[OldInst]; @@ -458,14 +456,14 @@ bool LoopUnswitch::processCurrentLoop() { // Find a value to unswitch on: // FIXME: this should chose the most expensive case! // FIXME: scan for a case with a non-critical edge? - Constant *UnswitchVal = NULL; + Constant *UnswitchVal = 0; // Do not process same value again and again. // At this point we have some cases already unswitched and // some not yet unswitched. Let's find the first not yet unswitched one. for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) { - Constant* UnswitchValCandidate = i.getCaseValue(); + Constant *UnswitchValCandidate = i.getCaseValue(); if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) { UnswitchVal = UnswitchValCandidate; break; @@ -511,7 +509,8 @@ static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB, // Already visited. Without more analysis, this could indicate an infinite // loop. return false; - } else if (!L->contains(BB)) { + } + if (!L->contains(BB)) { // Otherwise, this is a loop exit, this is fine so long as this is the // first exit. if (ExitBB != 0) return false; @@ -595,11 +594,11 @@ bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val, // on already unswitched cases. for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) { - BasicBlock* LoopExitCandidate; + BasicBlock *LoopExitCandidate; if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop, i.getCaseSuccessor()))) { // Okay, we found a trivial case, remember the value that is trivial. - ConstantInt* CaseVal = i.getCaseValue(); + ConstantInt *CaseVal = i.getCaseValue(); // Check that it was not unswitched before, since already unswitched // trivial vals are looks trivial too. @@ -752,7 +751,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, /// SplitExitEdges - Split all of the edges from inside the loop to their exit /// blocks. Update the appropriate Phi nodes as we do so. void LoopUnswitch::SplitExitEdges(Loop *L, - const SmallVector<BasicBlock *, 8> &ExitBlocks){ + const SmallVectorImpl<BasicBlock *> &ExitBlocks){ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { BasicBlock *ExitBlock = ExitBlocks[i]; @@ -854,9 +853,8 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // If the successor of the exit block had PHI nodes, add an entry for // NewExit. 
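    // (PHI nodes always sit at the head of a block, so the rewritten loop
    // below, with the dyn_cast in its condition, visits exactly the leading
    // PHIs and stops at the first non-PHI instruction.)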
- PHINode *PN; - for (BasicBlock::iterator I = ExitSucc->begin(); isa<PHINode>(I); ++I) { - PN = cast<PHINode>(I); + for (BasicBlock::iterator I = ExitSucc->begin(); + PHINode *PN = dyn_cast<PHINode>(I); ++I) { Value *V = PN->getIncomingValueForBlock(ExitBlocks[i]); ValueToValueMapTy::iterator It = VMap.find(V); if (It != VMap.end()) V = It->second; @@ -864,8 +862,8 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, } if (LandingPadInst *LPad = NewExit->getLandingPadInst()) { - PN = PHINode::Create(LPad->getType(), 0, "", - ExitSucc->getFirstInsertionPt()); + PHINode *PN = PHINode::Create(LPad->getType(), 0, "", + ExitSucc->getFirstInsertionPt()); for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc); I != E; ++I) { @@ -946,117 +944,6 @@ static void ReplaceUsesOfWith(Instruction *I, Value *V, ++NumSimplify; } -/// RemoveBlockIfDead - If the specified block is dead, remove it, update loop -/// information, and remove any dead successors it has. -/// -void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, - std::vector<Instruction*> &Worklist, - Loop *L) { - if (pred_begin(BB) != pred_end(BB)) { - // This block isn't dead, since an edge to BB was just removed, see if there - // are any easy simplifications we can do now. - if (BasicBlock *Pred = BB->getSinglePredecessor()) { - // If it has one pred, fold phi nodes in BB. - while (isa<PHINode>(BB->begin())) - ReplaceUsesOfWith(BB->begin(), - cast<PHINode>(BB->begin())->getIncomingValue(0), - Worklist, L, LPM); - - // If this is the header of a loop and the only pred is the latch, we now - // have an unreachable loop. - if (Loop *L = LI->getLoopFor(BB)) - if (loopHeader == BB && L->contains(Pred)) { - // Remove the branch from the latch to the header block, this makes - // the header dead, which will make the latch dead (because the header - // dominates the latch). - LPM->deleteSimpleAnalysisValue(Pred->getTerminator(), L); - Pred->getTerminator()->eraseFromParent(); - new UnreachableInst(BB->getContext(), Pred); - - // The loop is now broken, remove it from LI. - RemoveLoopFromHierarchy(L); - - // Reprocess the header, which now IS dead. - RemoveBlockIfDead(BB, Worklist, L); - return; - } - - // If pred ends in a uncond branch, add uncond branch to worklist so that - // the two blocks will get merged. - if (BranchInst *BI = dyn_cast<BranchInst>(Pred->getTerminator())) - if (BI->isUnconditional()) - Worklist.push_back(BI); - } - return; - } - - DEBUG(dbgs() << "Nuking dead block: " << *BB); - - // Remove the instructions in the basic block from the worklist. - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { - RemoveFromWorklist(I, Worklist); - - // Anything that uses the instructions in this basic block should have their - // uses replaced with undefs. - // If I is not void type then replaceAllUsesWith undef. - // This allows ValueHandlers and custom metadata to adjust itself. - if (!I->getType()->isVoidTy()) - I->replaceAllUsesWith(UndefValue::get(I->getType())); - } - - // If this is the edge to the header block for a loop, remove the loop and - // promote all subloops. - if (Loop *BBLoop = LI->getLoopFor(BB)) { - if (BBLoop->getLoopLatch() == BB) { - RemoveLoopFromHierarchy(BBLoop); - if (currentLoop == BBLoop) { - currentLoop = 0; - redoLoop = false; - } - } - } - - // Remove the block from the loop info, which removes it from any loops it - // was in. - LI->removeBlock(BB); - - - // Remove phi node entries in successors for this block. 
- TerminatorInst *TI = BB->getTerminator(); - SmallVector<BasicBlock*, 4> Succs; - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) { - Succs.push_back(TI->getSuccessor(i)); - TI->getSuccessor(i)->removePredecessor(BB); - } - - // Unique the successors, remove anything with multiple uses. - array_pod_sort(Succs.begin(), Succs.end()); - Succs.erase(std::unique(Succs.begin(), Succs.end()), Succs.end()); - - // Remove the basic block, including all of the instructions contained in it. - LPM->deleteSimpleAnalysisValue(BB, L); - BB->eraseFromParent(); - // Remove successor blocks here that are not dead, so that we know we only - // have dead blocks in this list. Nondead blocks have a way of becoming dead, - // then getting removed before we revisit them, which is badness. - // - for (unsigned i = 0; i != Succs.size(); ++i) - if (pred_begin(Succs[i]) != pred_end(Succs[i])) { - // One exception is loop headers. If this block was the preheader for a - // loop, then we DO want to visit the loop so the loop gets deleted. - // We know that if the successor is a loop header, that this loop had to - // be the preheader: the case where this was the latch block was handled - // above and headers can only have two predecessors. - if (!LI->isLoopHeader(Succs[i])) { - Succs.erase(Succs.begin()+i); - --i; - } - } - - for (unsigned i = 0, e = Succs.size(); i != e; ++i) - RemoveBlockIfDead(Succs[i], Worklist, L); -} - /// RemoveLoopFromHierarchy - We have discovered that the specified loop has /// become unwrapped, either because the backedge was deleted, or because the /// edge into the header was removed. If the edge into the header from the @@ -1088,7 +975,6 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, std::vector<Instruction*> Worklist; LLVMContext &Context = Val->getContext(); - // If we know that LIC == Val, or that LIC == NotVal, just replace uses of LIC // in the loop with the appropriate one directly. if (IsEqual || (isa<ConstantInt>(Val) && @@ -1108,8 +994,8 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, Worklist.push_back(U); } - for (std::vector<Instruction*>::iterator UI = Worklist.begin(); - UI != Worklist.end(); ++UI) + for (std::vector<Instruction*>::iterator UI = Worklist.begin(), + UE = Worklist.end(); UI != UE; ++UI) (*UI)->replaceUsesOfWith(LIC, Replacement); SimplifyCode(Worklist, L); @@ -1266,23 +1152,6 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { continue; } - if (ConstantInt *CB = dyn_cast<ConstantInt>(BI->getCondition())){ - // Conditional branch. Turn it into an unconditional branch, then - // remove dead blocks. - continue; // FIXME: Enable. 
- - DEBUG(dbgs() << "Folded branch: " << *BI); - BasicBlock *DeadSucc = BI->getSuccessor(CB->getZExtValue()); - BasicBlock *LiveSucc = BI->getSuccessor(!CB->getZExtValue()); - DeadSucc->removePredecessor(BI->getParent(), true); - Worklist.push_back(BranchInst::Create(LiveSucc, BI)); - LPM->deleteSimpleAnalysisValue(BI, L); - BI->eraseFromParent(); - RemoveFromWorklist(BI, Worklist); - ++NumSimplify; - - RemoveBlockIfDead(DeadSucc, Worklist, L); - } continue; } } diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index be0f0e8..9912d3d 100644 --- a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -170,14 +170,17 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &TD) const { // pessimize the llvm optimizer. // // Since we don't have perfect knowledge here, make some assumptions: assume - // the maximum GPR width is the same size as the pointer size and assume that - // this width can be stored. If so, check to see whether we will end up - // actually reducing the number of stores used. + // the maximum GPR width is the same size as the largest legal integer + // size. If so, check to see whether we will end up actually reducing the + // number of stores used. unsigned Bytes = unsigned(End-Start); - unsigned NumPointerStores = Bytes/TD.getPointerSize(); + unsigned MaxIntSize = TD.getLargestLegalIntTypeSize(); + if (MaxIntSize == 0) + MaxIntSize = 1; + unsigned NumPointerStores = Bytes / MaxIntSize; // Assume the remaining bytes if any are done a byte at a time. - unsigned NumByteStores = Bytes - NumPointerStores*TD.getPointerSize(); + unsigned NumByteStores = Bytes - NumPointerStores * MaxIntSize; // If we will reduce the # stores (according to this heuristic), do the // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32 @@ -465,7 +468,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc()); // Zap all the stores. - for (SmallVector<Instruction*, 16>::const_iterator + for (SmallVectorImpl<Instruction *>::const_iterator SI = Range.TheStores.begin(), SE = Range.TheStores.end(); SI != SE; ++SI) { MD->removeInstruction(*SI); @@ -626,8 +629,14 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, return false; Type *StructTy = cast<PointerType>(A->getType())->getElementType(); - uint64_t destSize = TD->getTypeAllocSize(StructTy); + if (!StructTy->isSized()) { + // The call may never return and hence the copy-instruction may never + // be executed, and therefore it's not safe to say "the destination + // has at least <cpyLen> bytes, as implied by the copy-instruction", + return false; + } + uint64_t destSize = TD->getTypeAllocSize(StructTy); if (destSize < srcSize) return false; } else { diff --git a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp new file mode 100644 index 0000000..15cee44 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -0,0 +1,156 @@ +//===--- PartiallyInlineLibCalls.cpp - Partially inline libcalls ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This pass tries to partially inline the fast path of well-known library +// functions, such as using square-root instructions for cases where sqrt() +// does not need to set errno. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "partially-inline-libcalls" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +namespace { + class PartiallyInlineLibCalls : public FunctionPass { + public: + static char ID; + + PartiallyInlineLibCalls() : + FunctionPass(ID) { + initializePartiallyInlineLibCallsPass(*PassRegistry::getPassRegistry()); + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual bool runOnFunction(Function &F); + + private: + /// Optimize calls to sqrt. + bool optimizeSQRT(CallInst *Call, Function *CalledFunc, + BasicBlock &CurrBB, Function::iterator &BB); + }; + + char PartiallyInlineLibCalls::ID = 0; +} + +INITIALIZE_PASS(PartiallyInlineLibCalls, "partially-inline-libcalls", + "Partially inline calls to library functions", false, false) + +void PartiallyInlineLibCalls::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetTransformInfo>(); + FunctionPass::getAnalysisUsage(AU); +} + +bool PartiallyInlineLibCalls::runOnFunction(Function &F) { + bool Changed = false; + Function::iterator CurrBB; + TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); + const TargetTransformInfo *TTI = &getAnalysis<TargetTransformInfo>(); + for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) { + CurrBB = BB++; + + for (BasicBlock::iterator II = CurrBB->begin(), IE = CurrBB->end(); + II != IE; ++II) { + CallInst *Call = dyn_cast<CallInst>(&*II); + Function *CalledFunc; + + if (!Call || !(CalledFunc = Call->getCalledFunction())) + continue; + + // Skip if function either has local linkage or is not a known library + // function. + LibFunc::Func LibFunc; + if (CalledFunc->hasLocalLinkage() || !CalledFunc->hasName() || + !TLI->getLibFunc(CalledFunc->getName(), LibFunc)) + continue; + + switch (LibFunc) { + case LibFunc::sqrtf: + case LibFunc::sqrt: + if (TTI->haveFastSqrt(Call->getType()) && + optimizeSQRT(Call, CalledFunc, *CurrBB, BB)) + break; + continue; + default: + continue; + } + + Changed = true; + break; + } + } + + return Changed; +} + +bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call, + Function *CalledFunc, + BasicBlock &CurrBB, + Function::iterator &BB) { + // There is no need to change the IR, since backend will emit sqrt + // instruction if the call has already been marked read-only. + if (Call->onlyReadsMemory()) + return false; + + // Do the following transformation: + // + // (before) + // dst = sqrt(src) + // + // (after) + // v0 = sqrt_noreadmem(src) # native sqrt instruction. + // if (v0 is a NaN) + // v1 = sqrt(src) # library call. + // dst = phi(v0, v1) + // + + // Move all instructions following Call to newly created block JoinBB. + // Create phi and replace all uses. 
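+  // For instance, for "d = sqrt(x)" the rewritten function looks roughly
+  // like this (illustrative IR; actual value names will differ):
+  //   entry:
+  //     %v0 = call double @sqrt(double %x)   ; marked readnone below
+  //     %cmp = fcmp oeq double %v0, %v0      ; false only if %v0 is a NaN
+  //     br i1 %cmp, label %join, label %call.sqrt
+  //   call.sqrt:
+  //     %v1 = call double @sqrt(double %x)   ; original errno-setting call
+  //     br label %join
+  //   join:
+  //     %d = phi double [ %v0, %entry ], [ %v1, %call.sqrt ]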
+ BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode(), this); + IRBuilder<> Builder(JoinBB, JoinBB->begin()); + PHINode *Phi = Builder.CreatePHI(Call->getType(), 2); + Call->replaceAllUsesWith(Phi); + + // Create basic block LibCallBB and insert a call to library function sqrt. + BasicBlock *LibCallBB = BasicBlock::Create(CurrBB.getContext(), "call.sqrt", + CurrBB.getParent(), JoinBB); + Builder.SetInsertPoint(LibCallBB); + Instruction *LibCall = Call->clone(); + Builder.Insert(LibCall); + Builder.CreateBr(JoinBB); + + // Add attribute "readnone" so that backend can use a native sqrt instruction + // for this call. Insert a FP compare instruction and a conditional branch + // at the end of CurrBB. + Call->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone); + CurrBB.getTerminator()->eraseFromParent(); + Builder.SetInsertPoint(&CurrBB); + Value *FCmp = Builder.CreateFCmpOEQ(Call, Call); + Builder.CreateCondBr(FCmp, JoinBB, LibCallBB); + + // Add phi operands. + Phi->addIncoming(Call, &CurrBB); + Phi->addIncoming(LibCall, LibCallBB); + + BB = JoinBB; + return true; +} + +FunctionPass *llvm::createPartiallyInlineLibCallsPass() { + return new PartiallyInlineLibCalls(); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp index a3c241d..328a9c5 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -122,7 +122,6 @@ namespace { class XorOpnd { public: XorOpnd(Value *V); - const XorOpnd &operator=(const XorOpnd &That); bool isInvalid() const { return SymbolicPart == 0; } bool isOrExpr() const { return isOr; } @@ -225,15 +224,6 @@ XorOpnd::XorOpnd(Value *V) { isOr = true; } -const XorOpnd &XorOpnd::operator=(const XorOpnd &That) { - OrigVal = That.OrigVal; - SymbolicPart = That.SymbolicPart; - ConstPart = That.ConstPart; - SymbolicRank = That.SymbolicRank; - isOr = That.isOr; - return *this; -} - char Reassociate::ID = 0; INITIALIZE_PASS(Reassociate, "reassociate", "Reassociate expressions", false, false) @@ -251,21 +241,24 @@ static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) { } static bool isUnmovableInstruction(Instruction *I) { - if (I->getOpcode() == Instruction::PHI || - I->getOpcode() == Instruction::LandingPad || - I->getOpcode() == Instruction::Alloca || - I->getOpcode() == Instruction::Load || - I->getOpcode() == Instruction::Invoke || - (I->getOpcode() == Instruction::Call && - !isa<DbgInfoIntrinsic>(I)) || - I->getOpcode() == Instruction::UDiv || - I->getOpcode() == Instruction::SDiv || - I->getOpcode() == Instruction::FDiv || - I->getOpcode() == Instruction::URem || - I->getOpcode() == Instruction::SRem || - I->getOpcode() == Instruction::FRem) + switch (I->getOpcode()) { + case Instruction::PHI: + case Instruction::LandingPad: + case Instruction::Alloca: + case Instruction::Load: + case Instruction::Invoke: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: return true; - return false; + case Instruction::Call: + return !isa<DbgInfoIntrinsic>(I); + default: + return false; + } } void Reassociate::BuildRankMap(Function &F) { diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp index e30a274..4364720 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -214,7 +214,7 @@ public: /// This returns true if 
the block was not considered live before. bool MarkBlockExecutable(BasicBlock *BB) { if (!BBExecutable.insert(BB)) return false; - DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << "\n"); + DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << '\n'); BBWorkList.push_back(BB); // Add the block to the work list! return true; } @@ -427,7 +427,7 @@ private: // feasible that wasn't before. Revisit the PHI nodes in the block // because they have potentially new operands. DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName() - << " -> " << Dest->getName() << "\n"); + << " -> " << Dest->getName() << '\n'); PHINode *PN; for (BasicBlock::iterator I = Dest->begin(); @@ -439,7 +439,7 @@ private: // getFeasibleSuccessors - Return a vector of booleans to indicate which // successors are reachable from a given terminator instruction. // - void getFeasibleSuccessors(TerminatorInst &TI, SmallVector<bool, 16> &Succs); + void getFeasibleSuccessors(TerminatorInst &TI, SmallVectorImpl<bool> &Succs); // isEdgeFeasible - Return true if the control flow edge from the 'From' basic // block to the 'To' basic block is currently feasible. @@ -501,7 +501,7 @@ private: void visitInstruction(Instruction &I) { // If a new instruction is added to LLVM that we don't handle. - dbgs() << "SCCP: Don't know how to handle: " << I; + dbgs() << "SCCP: Don't know how to handle: " << I << '\n'; markAnythingOverdefined(&I); // Just in case } }; @@ -513,7 +513,7 @@ private: // successors are reachable from a given terminator instruction. // void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, - SmallVector<bool, 16> &Succs) { + SmallVectorImpl<bool> &Succs) { Succs.resize(TI.getNumSuccessors()); if (BranchInst *BI = dyn_cast<BranchInst>(&TI)) { if (BI->isUnconditional()) { @@ -1604,7 +1604,7 @@ bool SCCP::runOnFunction(Function &F) { Constant *Const = IV.isConstant() ? IV.getConstant() : UndefValue::get(Inst->getType()); - DEBUG(dbgs() << " Constant: " << *Const << " = " << *Inst); + DEBUG(dbgs() << " Constant: " << *Const << " = " << *Inst << '\n'); // Replaces all of the uses of a variable with uses of the constant. Inst->replaceAllUsesWith(Const); @@ -1812,7 +1812,7 @@ bool IPSCCP::runOnModule(Module &M) { Constant *Const = IV.isConstant() ? IV.getConstant() : UndefValue::get(Inst->getType()); - DEBUG(dbgs() << " Constant: " << *Const << " = " << *Inst); + DEBUG(dbgs() << " Constant: " << *Const << " = " << *Inst << '\n'); // Replaces all of the uses of a variable with uses of the // constant. 
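      // (For example, once the solver proves %x is always the constant 42,
      // a use such as "%y = add i32 %x, 1" now reads the constant directly
      // and can later fold to 43.)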
diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp index d073e78..9f3fc83 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp @@ -47,6 +47,7 @@ #include "llvm/InstVisitor.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -58,9 +59,9 @@ using namespace llvm; STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement"); STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed"); -STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions"); -STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses found"); -STATISTIC(MaxPartitionUsesPerAlloca, "Maximum number of partition uses"); +STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca"); +STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten"); +STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition"); STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced"); STATISTIC(NumPromoted, "Number of allocas promoted to SSA values"); STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion"); @@ -110,17 +111,39 @@ typedef llvm::IRBuilder<false, ConstantFolder, } namespace { -/// \brief A common base class for representing a half-open byte range. -struct ByteRange { +/// \brief A used slice of an alloca. +/// +/// This structure represents a slice of an alloca used by some instruction. It +/// stores both the begin and end offsets of this use, a pointer to the use +/// itself, and a flag indicating whether we can classify the use as splittable +/// or not when forming partitions of the alloca. +class Slice { /// \brief The beginning offset of the range. uint64_t BeginOffset; /// \brief The ending offset, not included in the range. uint64_t EndOffset; - ByteRange() : BeginOffset(), EndOffset() {} - ByteRange(uint64_t BeginOffset, uint64_t EndOffset) - : BeginOffset(BeginOffset), EndOffset(EndOffset) {} + /// \brief Storage for both the use of this slice and whether it can be + /// split. + PointerIntPair<Use *, 1, bool> UseAndIsSplittable; + +public: + Slice() : BeginOffset(), EndOffset() {} + Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable) + : BeginOffset(BeginOffset), EndOffset(EndOffset), + UseAndIsSplittable(U, IsSplittable) {} + + uint64_t beginOffset() const { return BeginOffset; } + uint64_t endOffset() const { return EndOffset; } + + bool isSplittable() const { return UseAndIsSplittable.getInt(); } + void makeUnsplittable() { UseAndIsSplittable.setInt(false); } + + Use *getUse() const { return UseAndIsSplittable.getPointer(); } + + bool isDead() const { return getUse() == 0; } + void kill() { UseAndIsSplittable.setPointer(0); } /// \brief Support for ordering ranges. /// @@ -128,173 +151,67 @@ struct ByteRange { /// always increasing, and within equal start offsets, the end offsets are /// decreasing. Thus the spanning range comes first in a cluster with the /// same start position. 
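  /// (Illustrative: [0,16) sorts before [4,12); among slices starting at the
  /// same offset, an unsplittable slice now sorts before a splittable one,
  /// and [0,16) still sorts before [0,8) when their splittability matches.)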
- bool operator<(const ByteRange &RHS) const { - if (BeginOffset < RHS.BeginOffset) return true; - if (BeginOffset > RHS.BeginOffset) return false; - if (EndOffset > RHS.EndOffset) return true; + bool operator<(const Slice &RHS) const { + if (beginOffset() < RHS.beginOffset()) return true; + if (beginOffset() > RHS.beginOffset()) return false; + if (isSplittable() != RHS.isSplittable()) return !isSplittable(); + if (endOffset() > RHS.endOffset()) return true; return false; } /// \brief Support comparison with a single offset to allow binary searches. - friend bool operator<(const ByteRange &LHS, uint64_t RHSOffset) { - return LHS.BeginOffset < RHSOffset; + friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS, + uint64_t RHSOffset) { + return LHS.beginOffset() < RHSOffset; } - friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset, - const ByteRange &RHS) { - return LHSOffset < RHS.BeginOffset; + const Slice &RHS) { + return LHSOffset < RHS.beginOffset(); } - bool operator==(const ByteRange &RHS) const { - return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset; + bool operator==(const Slice &RHS) const { + return isSplittable() == RHS.isSplittable() && + beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset(); } - bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); } + bool operator!=(const Slice &RHS) const { return !operator==(RHS); } }; - -/// \brief A partition of an alloca. -/// -/// This structure represents a contiguous partition of the alloca. These are -/// formed by examining the uses of the alloca. During formation, they may -/// overlap but once an AllocaPartitioning is built, the Partitions within it -/// are all disjoint. -struct Partition : public ByteRange { - /// \brief Whether this partition is splittable into smaller partitions. - /// - /// We flag partitions as splittable when they are formed entirely due to - /// accesses by trivially splittable operations such as memset and memcpy. - bool IsSplittable; - - /// \brief Test whether a partition has been marked as dead. - bool isDead() const { - if (BeginOffset == UINT64_MAX) { - assert(EndOffset == UINT64_MAX); - return true; - } - return false; - } - - /// \brief Kill a partition. - /// This is accomplished by setting both its beginning and end offset to - /// the maximum possible value. - void kill() { - assert(!isDead() && "He's Dead, Jim!"); - BeginOffset = EndOffset = UINT64_MAX; - } - - Partition() : ByteRange(), IsSplittable() {} - Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable) - : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {} -}; - -/// \brief A particular use of a partition of the alloca. -/// -/// This structure is used to associate uses of a partition with it. They -/// mark the range of bytes which are referenced by a particular instruction, -/// and includes a handle to the user itself and the pointer value in use. -/// The bounds of these uses are determined by intersecting the bounds of the -/// memory use itself with a particular partition. As a consequence there is -/// intentionally overlap between various uses of the same partition. -class PartitionUse : public ByteRange { - /// \brief Combined storage for both the Use* and split state. 
- PointerIntPair<Use*, 1, bool> UsePtrAndIsSplit; - -public: - PartitionUse() : ByteRange(), UsePtrAndIsSplit() {} - PartitionUse(uint64_t BeginOffset, uint64_t EndOffset, Use *U, - bool IsSplit) - : ByteRange(BeginOffset, EndOffset), UsePtrAndIsSplit(U, IsSplit) {} - - /// \brief The use in question. Provides access to both user and used value. - /// - /// Note that this may be null if the partition use is *dead*, that is, it - /// should be ignored. - Use *getUse() const { return UsePtrAndIsSplit.getPointer(); } - - /// \brief Set the use for this partition use range. - void setUse(Use *U) { UsePtrAndIsSplit.setPointer(U); } - - /// \brief Whether this use is split across multiple partitions. - bool isSplit() const { return UsePtrAndIsSplit.getInt(); } -}; -} +} // end anonymous namespace namespace llvm { -template <> struct isPodLike<Partition> : llvm::true_type {}; -template <> struct isPodLike<PartitionUse> : llvm::true_type {}; +template <typename T> struct isPodLike; +template <> struct isPodLike<Slice> { + static const bool value = true; +}; } namespace { -/// \brief Alloca partitioning representation. +/// \brief Representation of the alloca slices. /// -/// This class represents a partitioning of an alloca into slices, and -/// information about the nature of uses of each slice of the alloca. The goal -/// is that this information is sufficient to decide if and how to split the -/// alloca apart and replace slices with scalars. It is also intended that this -/// structure can capture the relevant information needed both to decide about -/// and to enact these transformations. -class AllocaPartitioning { +/// This class represents the slices of an alloca which are formed by its +/// various uses. If a pointer escapes, we can't fully build a representation +/// for the slices used and we reflect that in this structure. The uses are +/// stored, sorted by increasing beginning offset and with unsplittable slices +/// starting at a particular offset before splittable slices. +class AllocaSlices { public: - /// \brief Construct a partitioning of a particular alloca. - /// - /// Construction does most of the work for partitioning the alloca. This - /// performs the necessary walks of users and builds a partitioning from it. - AllocaPartitioning(const DataLayout &TD, AllocaInst &AI); + /// \brief Construct the slices of a particular alloca. + AllocaSlices(const DataLayout &DL, AllocaInst &AI); /// \brief Test whether a pointer to the allocation escapes our analysis. /// - /// If this is true, the partitioning is never fully built and should be + /// If this is true, the slices are never fully built and should be /// ignored. bool isEscaped() const { return PointerEscapingInstr; } - /// \brief Support for iterating over the partitions. + /// \brief Support for iterating over the slices. /// @{ - typedef SmallVectorImpl<Partition>::iterator iterator; - iterator begin() { return Partitions.begin(); } - iterator end() { return Partitions.end(); } + typedef SmallVectorImpl<Slice>::iterator iterator; + iterator begin() { return Slices.begin(); } + iterator end() { return Slices.end(); } - typedef SmallVectorImpl<Partition>::const_iterator const_iterator; - const_iterator begin() const { return Partitions.begin(); } - const_iterator end() const { return Partitions.end(); } - /// @} - - /// \brief Support for iterating over and manipulating a particular - /// partition's uses. 
- /// - /// The iteration support provided for uses is more limited, but also - /// includes some manipulation routines to support rewriting the uses of - /// partitions during SROA. - /// @{ - typedef SmallVectorImpl<PartitionUse>::iterator use_iterator; - use_iterator use_begin(unsigned Idx) { return Uses[Idx].begin(); } - use_iterator use_begin(const_iterator I) { return Uses[I - begin()].begin(); } - use_iterator use_end(unsigned Idx) { return Uses[Idx].end(); } - use_iterator use_end(const_iterator I) { return Uses[I - begin()].end(); } - - typedef SmallVectorImpl<PartitionUse>::const_iterator const_use_iterator; - const_use_iterator use_begin(unsigned Idx) const { return Uses[Idx].begin(); } - const_use_iterator use_begin(const_iterator I) const { - return Uses[I - begin()].begin(); - } - const_use_iterator use_end(unsigned Idx) const { return Uses[Idx].end(); } - const_use_iterator use_end(const_iterator I) const { - return Uses[I - begin()].end(); - } - - unsigned use_size(unsigned Idx) const { return Uses[Idx].size(); } - unsigned use_size(const_iterator I) const { return Uses[I - begin()].size(); } - const PartitionUse &getUse(unsigned PIdx, unsigned UIdx) const { - return Uses[PIdx][UIdx]; - } - const PartitionUse &getUse(const_iterator I, unsigned UIdx) const { - return Uses[I - begin()][UIdx]; - } - - void use_push_back(unsigned Idx, const PartitionUse &PU) { - Uses[Idx].push_back(PU); - } - void use_push_back(const_iterator I, const PartitionUse &PU) { - Uses[I - begin()].push_back(PU); - } + typedef SmallVectorImpl<Slice>::const_iterator const_iterator; + const_iterator begin() const { return Slices.begin(); } + const_iterator end() const { return Slices.end(); } /// @} /// \brief Allow iterating the dead users for this alloca. @@ -320,66 +237,12 @@ public: dead_op_iterator dead_op_end() const { return DeadOperands.end(); } /// @} - /// \brief MemTransferInst auxiliary data. - /// This struct provides some auxiliary data about memory transfer - /// intrinsics such as memcpy and memmove. These intrinsics can use two - /// different ranges within the same alloca, and provide other challenges to - /// correctly represent. We stash extra data to help us untangle this - /// after the partitioning is complete. - struct MemTransferOffsets { - /// The destination begin and end offsets when the destination is within - /// this alloca. If the end offset is zero the destination is not within - /// this alloca. - uint64_t DestBegin, DestEnd; - - /// The source begin and end offsets when the source is within this alloca. - /// If the end offset is zero, the source is not within this alloca. - uint64_t SourceBegin, SourceEnd; - - /// Flag for whether an alloca is splittable. - bool IsSplittable; - }; - MemTransferOffsets getMemTransferOffsets(MemTransferInst &II) const { - return MemTransferInstData.lookup(&II); - } - - /// \brief Map from a PHI or select operand back to a partition. - /// - /// When manipulating PHI nodes or selects, they can use more than one - /// partition of an alloca. We store a special mapping to allow finding the - /// partition referenced by each of these operands, if any. - iterator findPartitionForPHIOrSelectOperand(Use *U) { - SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt - = PHIOrSelectOpMap.find(U); - if (MapIt == PHIOrSelectOpMap.end()) - return end(); - - return begin() + MapIt->second.first; - } - - /// \brief Map from a PHI or select operand back to the specific use of - /// a partition. 
- /// - /// Similar to mapping these operands back to the partitions, this maps - /// directly to the use structure of that partition. - use_iterator findPartitionUseForPHIOrSelectOperand(Use *U) { - SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt - = PHIOrSelectOpMap.find(U); - assert(MapIt != PHIOrSelectOpMap.end()); - return Uses[MapIt->second.first].begin() + MapIt->second.second; - } - - /// \brief Compute a common type among the uses of a particular partition. - /// - /// This routines walks all of the uses of a particular partition and tries - /// to find a common type between them. Untyped operations such as memset and - /// memcpy are ignored. - Type *getCommonType(iterator I) const; - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const; - void printUsers(raw_ostream &OS, const_iterator I, + void printSlice(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const; + void printUse(raw_ostream &OS, const_iterator I, + StringRef Indent = " ") const; void print(raw_ostream &OS) const; void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump(const_iterator I) const; void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump() const; @@ -387,47 +250,36 @@ public: private: template <typename DerivedT, typename RetT = void> class BuilderBase; - class PartitionBuilder; - friend class AllocaPartitioning::PartitionBuilder; - class UseBuilder; - friend class AllocaPartitioning::UseBuilder; + class SliceBuilder; + friend class AllocaSlices::SliceBuilder; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// \brief Handle to alloca instruction to simplify method interfaces. AllocaInst &AI; #endif - /// \brief The instruction responsible for this alloca having no partitioning. + /// \brief The instruction responsible for this alloca not having a known set + /// of slices. /// /// When an instruction (potentially) escapes the pointer to the alloca, we - /// store a pointer to that here and abort trying to partition the alloca. - /// This will be null if the alloca is partitioned successfully. + /// store a pointer to that here and abort trying to form slices of the + /// alloca. This will be null if the alloca slices are analyzed successfully. Instruction *PointerEscapingInstr; - /// \brief The partitions of the alloca. + /// \brief The slices of the alloca. /// - /// We store a vector of the partitions over the alloca here. This vector is - /// sorted by increasing begin offset, and then by decreasing end offset. See - /// the Partition inner class for more details. Initially (during - /// construction) there are overlaps, but we form a disjoint sequence of - /// partitions while finishing construction and a fully constructed object is - /// expected to always have this as a disjoint space. - SmallVector<Partition, 8> Partitions; - - /// \brief The uses of the partitions. - /// - /// This is essentially a mapping from each partition to a list of uses of - /// that partition. The mapping is done with a Uses vector that has the exact - /// same number of entries as the partition vector. Each entry is itself - /// a vector of the uses. - SmallVector<SmallVector<PartitionUse, 2>, 8> Uses; + /// We store a vector of the slices formed by uses of the alloca here. This + /// vector is sorted by increasing begin offset, and then the unsplittable + /// slices before the splittable ones. See the Slice inner class for more + /// details. 
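+  /// For example (illustrative): for "%a = alloca { i32, i32 }" with a store
+  /// to each field and a memset covering all 8 bytes, this vector would hold
+  /// [0,4) and [4,8) as unsplittable slices for the stores and [0,8) as a
+  /// splittable slice for the memset, sorted as [0,4), [0,8), [4,8).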
+ SmallVector<Slice, 8> Slices; /// \brief Instructions which will become dead if we rewrite the alloca. /// - /// Note that these are not separated by partition. This is because we expect - /// a partitioned alloca to be completely rewritten or not rewritten at all. - /// If rewritten, all these instructions can simply be removed and replaced - /// with undef as they come from outside of the allocated space. + /// Note that these are not separated by slice. This is because we expect an + /// alloca to be completely rewritten or not rewritten at all. If rewritten, + /// all these instructions can simply be removed and replaced with undef as + /// they come from outside of the allocated space. SmallVector<Instruction *, 8> DeadUsers; /// \brief Operands which will become dead if we rewrite the alloca. @@ -439,26 +291,6 @@ private: /// want to swap this particular input for undef to simplify the use lists of /// the alloca. SmallVector<Use *, 8> DeadOperands; - - /// \brief The underlying storage for auxiliary memcpy and memset info. - SmallDenseMap<MemTransferInst *, MemTransferOffsets, 4> MemTransferInstData; - - /// \brief A side datastructure used when building up the partitions and uses. - /// - /// This mapping is only really used during the initial building of the - /// partitioning so that we can retain information about PHI and select nodes - /// processed. - SmallDenseMap<Instruction *, std::pair<uint64_t, bool> > PHIOrSelectSizes; - - /// \brief Auxiliary information for particular PHI or select operands. - SmallDenseMap<Use *, std::pair<unsigned, unsigned>, 4> PHIOrSelectOpMap; - - /// \brief A utility routine called from the constructor. - /// - /// This does what it says on the tin. It is the key of the alloca partition - /// splitting and merging. After it is called we have the desired disjoint - /// collection of partitions. - void splitAndMergePartitions(); }; } @@ -474,29 +306,35 @@ static Value *foldSelectInst(SelectInst &SI) { return 0; } -/// \brief Builder for the alloca partitioning. +/// \brief Builder for the alloca slices. /// -/// This class builds an alloca partitioning by recursively visiting the uses -/// of an alloca and splitting the partitions for each load and store at each -/// offset. -class AllocaPartitioning::PartitionBuilder - : public PtrUseVisitor<PartitionBuilder> { - friend class PtrUseVisitor<PartitionBuilder>; - friend class InstVisitor<PartitionBuilder>; - typedef PtrUseVisitor<PartitionBuilder> Base; +/// This class builds a set of alloca slices by recursively visiting the uses +/// of an alloca and making a slice for each load and store at each offset. +class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> { + friend class PtrUseVisitor<SliceBuilder>; + friend class InstVisitor<SliceBuilder>; + typedef PtrUseVisitor<SliceBuilder> Base; const uint64_t AllocSize; - AllocaPartitioning &P; + AllocaSlices &S; + + SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap; + SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes; - SmallDenseMap<Instruction *, unsigned> MemTransferPartitionMap; + /// \brief Set to de-duplicate dead instructions found in the use walk. 
+ SmallPtrSet<Instruction *, 4> VisitedDeadInsts; public: - PartitionBuilder(const DataLayout &DL, AllocaInst &AI, AllocaPartitioning &P) - : PtrUseVisitor<PartitionBuilder>(DL), - AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), - P(P) {} + SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &S) + : PtrUseVisitor<SliceBuilder>(DL), + AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), S(S) {} private: + void markAsDead(Instruction &I) { + if (VisitedDeadInsts.insert(&I)) + S.DeadUsers.push_back(&I); + } + void insertUse(Instruction &I, const APInt &Offset, uint64_t Size, bool IsSplittable = false) { // Completely skip uses which have a zero size or start either before or @@ -505,9 +343,9 @@ private: DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset << " which has zero size or starts outside of the " << AllocSize << " byte alloca:\n" - << " alloca: " << P.AI << "\n" + << " alloca: " << S.AI << "\n" << " use: " << I << "\n"); - return; + return markAsDead(I); } uint64_t BeginOffset = Offset.getZExtValue(); @@ -523,13 +361,26 @@ private: if (Size > AllocSize - BeginOffset) { DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset << " to remain within the " << AllocSize << " byte alloca:\n" - << " alloca: " << P.AI << "\n" + << " alloca: " << S.AI << "\n" << " use: " << I << "\n"); EndOffset = AllocSize; } - Partition New(BeginOffset, EndOffset, IsSplittable); - P.Partitions.push_back(New); + S.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable)); + } + + void visitBitCastInst(BitCastInst &BC) { + if (BC.use_empty()) + return markAsDead(BC); + + return Base::visitBitCastInst(BC); + } + + void visitGetElementPtrInst(GetElementPtrInst &GEPI) { + if (GEPI.use_empty()) + return markAsDead(GEPI); + + return Base::visitGetElementPtrInst(GEPI); } void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset, @@ -580,9 +431,9 @@ private: DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset << " which extends past the end of the " << AllocSize << " byte alloca:\n" - << " alloca: " << P.AI << "\n" + << " alloca: " << S.AI << "\n" << " use: " << SI << "\n"); - return; + return markAsDead(SI); } assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) && @@ -597,7 +448,7 @@ private: if ((Length && Length->getValue() == 0) || (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) // Zero-length mem transfer intrinsics can be ignored entirely. - return; + return markAsDead(II); if (!IsOffsetKnown) return PI.setAborted(&II); @@ -613,7 +464,7 @@ private: if ((Length && Length->getValue() == 0) || (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) // Zero-length mem transfer intrinsics can be ignored entirely. - return; + return markAsDead(II); if (!IsOffsetKnown) return PI.setAborted(&II); @@ -622,63 +473,44 @@ private: uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset; - MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; - - // Only intrinsics with a constant length can be split. - Offsets.IsSplittable = Length; + // Check for the special case where the same exact value is used for both + // source and dest. + if (*U == II.getRawDest() && *U == II.getRawSource()) { + // For non-volatile transfers this is a no-op. 
+ if (!II.isVolatile()) + return markAsDead(II); - if (*U == II.getRawDest()) { - Offsets.DestBegin = RawOffset; - Offsets.DestEnd = RawOffset + Size; - } - if (*U == II.getRawSource()) { - Offsets.SourceBegin = RawOffset; - Offsets.SourceEnd = RawOffset + Size; + return insertUse(II, Offset, Size, /*IsSplittable=*/false); } - // If we have set up end offsets for both the source and the destination, - // we have found both sides of this transfer pointing at the same alloca. - bool SeenBothEnds = Offsets.SourceEnd && Offsets.DestEnd; - if (SeenBothEnds && II.getRawDest() != II.getRawSource()) { - unsigned PrevIdx = MemTransferPartitionMap[&II]; + // If we have seen both source and destination for a mem transfer, then + // they both point to the same alloca. + bool Inserted; + SmallDenseMap<Instruction *, unsigned>::iterator MTPI; + llvm::tie(MTPI, Inserted) = + MemTransferSliceMap.insert(std::make_pair(&II, S.Slices.size())); + unsigned PrevIdx = MTPI->second; + if (!Inserted) { + Slice &PrevP = S.Slices[PrevIdx]; // Check if the begin offsets match and this is a non-volatile transfer. // In that case, we can completely elide the transfer. - if (!II.isVolatile() && Offsets.SourceBegin == Offsets.DestBegin) { - P.Partitions[PrevIdx].kill(); - return; + if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) { + PrevP.kill(); + return markAsDead(II); } // Otherwise we have an offset transfer within the same alloca. We can't // split those. - P.Partitions[PrevIdx].IsSplittable = Offsets.IsSplittable = false; - } else if (SeenBothEnds) { - // Handle the case where this exact use provides both ends of the - // operation. - assert(II.getRawDest() == II.getRawSource()); - - // For non-volatile transfers this is a no-op. - if (!II.isVolatile()) - return; - - // Otherwise just suppress splitting. - Offsets.IsSplittable = false; + PrevP.makeUnsplittable(); } - // Insert the use now that we've fixed up the splittable nature. - insertUse(II, Offset, Size, Offsets.IsSplittable); - - // Setup the mapping from intrinsic to partition of we've not seen both - // ends of this transfer. - if (!SeenBothEnds) { - unsigned NewIdx = P.Partitions.size() - 1; - bool Inserted - = MemTransferPartitionMap.insert(std::make_pair(&II, NewIdx)).second; - assert(Inserted && - "Already have intrinsic in map but haven't seen both ends"); - (void)Inserted; - } + insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length); + + // Check that we ended up with a valid index in the map. + assert(S.Slices[PrevIdx].getUse()->getUser() == &II && + "Map index doesn't point back to a slice with this user."); } // Disable SRoA for any intrinsics except for lifetime invariants. @@ -702,7 +534,7 @@ private: Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) { // We consider any PHI or select that results in a direct load or store of - // the same offset to be a viable use for partitioning purposes. These uses + // the same offset to be a viable use for slicing purposes. These uses // are considered unsplittable and the size is the maximum loaded or stored // size. SmallPtrSet<Instruction *, 4> Visited; @@ -747,234 +579,36 @@ private: void visitPHINode(PHINode &PN) { if (PN.use_empty()) - return; + return markAsDead(PN); if (!IsOffsetKnown) return PI.setAborted(&PN); // See if we already have computed info on this node. 
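The MemTransferSliceMap handling above rests on a standard map idiom: a single insert call both records the slice index for a first-seen intrinsic and, through the returned bool, detects the second visit arriving from the other operand of the same memcpy (llvm::tie is the pre-C++11 stand-in for std::tie). A self-contained sketch of the idiom with std::map in place of SmallDenseMap and hypothetical names:

#include <map>
#include <utility>
#include <vector>

struct Slice { bool Splittable; };

std::map<const void *, unsigned> MemTransferSliceMap;
std::vector<Slice> Slices;

void visitTransferOperand(const void *Intrinsic, bool ConstantLength) {
  // insert() is a no-op when the key exists; 'second' tells us whether
  // this is the first operand of this intrinsic we have seen.
  std::pair<std::map<const void *, unsigned>::iterator, bool> Res =
      MemTransferSliceMap.insert(
          std::make_pair(Intrinsic, (unsigned)Slices.size()));
  if (!Res.second)
    // Second visit: both ends target this alloca, so the slice made on
    // the first visit can no longer be split.
    Slices[Res.first->second].Splittable = false;
  // A new slice is splittable only on a first visit with a known length.
  Slices.push_back(Slice{Res.second && ConstantLength});
}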
- std::pair<uint64_t, bool> &PHIInfo = P.PHIOrSelectSizes[&PN]; - if (PHIInfo.first) { - PHIInfo.second = true; - insertUse(PN, Offset, PHIInfo.first); - return; + uint64_t &PHISize = PHIOrSelectSizes[&PN]; + if (!PHISize) { + // This is a new PHI node, check for an unsafe use of the PHI node. + if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&PN, PHISize)) + return PI.setAborted(UnsafeI); } - // Check for an unsafe use of the PHI node. - if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first)) - return PI.setAborted(UnsafeI); - - insertUse(PN, Offset, PHIInfo.first); - } - - void visitSelectInst(SelectInst &SI) { - if (SI.use_empty()) - return; - if (Value *Result = foldSelectInst(SI)) { - if (Result == *U) - // If the result of the constant fold will be the pointer, recurse - // through the select as if we had RAUW'ed it. - enqueueUsers(SI); - - return; - } - if (!IsOffsetKnown) - return PI.setAborted(&SI); - - // See if we already have computed info on this node. - std::pair<uint64_t, bool> &SelectInfo = P.PHIOrSelectSizes[&SI]; - if (SelectInfo.first) { - SelectInfo.second = true; - insertUse(SI, Offset, SelectInfo.first); - return; - } - - // Check for an unsafe use of the PHI node. - if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first)) - return PI.setAborted(UnsafeI); - - insertUse(SI, Offset, SelectInfo.first); - } - - /// \brief Disable SROA entirely if there are unhandled users of the alloca. - void visitInstruction(Instruction &I) { - PI.setAborted(&I); - } -}; - -/// \brief Use adder for the alloca partitioning. -/// -/// This class adds the uses of an alloca to all of the partitions which they -/// use. For splittable partitions, this can end up doing essentially a linear -/// walk of the partitions, but the number of steps remains bounded by the -/// total result instruction size: -/// - The number of partitions is a result of the number unsplittable -/// instructions using the alloca. -/// - The number of users of each partition is at worst the total number of -/// splittable instructions using the alloca. -/// Thus we will produce N * M instructions in the end, where N are the number -/// of unsplittable uses and M are the number of splittable. This visitor does -/// the exact same number of updates to the partitioning. -/// -/// In the more common case, this visitor will leverage the fact that the -/// partition space is pre-sorted, and do a logarithmic search for the -/// partition needed, making the total visit a classical ((N + M) * log(N)) -/// complexity operation. -class AllocaPartitioning::UseBuilder : public PtrUseVisitor<UseBuilder> { - friend class PtrUseVisitor<UseBuilder>; - friend class InstVisitor<UseBuilder>; - typedef PtrUseVisitor<UseBuilder> Base; - - const uint64_t AllocSize; - AllocaPartitioning &P; - - /// \brief Set to de-duplicate dead instructions found in the use walk. - SmallPtrSet<Instruction *, 4> VisitedDeadInsts; - -public: - UseBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P) - : PtrUseVisitor<UseBuilder>(TD), - AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())), - P(P) {} - -private: - void markAsDead(Instruction &I) { - if (VisitedDeadInsts.insert(&I)) - P.DeadUsers.push_back(&I); - } - - void insertUse(Instruction &User, const APInt &Offset, uint64_t Size) { - // If the use has a zero size or extends outside of the allocation, record - // it as a dead use for elimination later. 
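The rewritten visitPHINode above uses a small caching idiom worth spelling out: operator[] on the size map default-constructs a zero entry and hands back a reference, so a zero value doubles as "not yet computed" and the expensive unsafe-use walk runs at most once per PHI or select. A standalone sketch, with a std::map and a stand-in for the walk:

#include <cstdint>
#include <map>

// Stand-in for the unsafe-use walk; assumed expensive.
static uint64_t computeSize(const void *Node) { (void)Node; return 8; }

static std::map<const void *, uint64_t> PHIOrSelectSizes;

uint64_t cachedSize(const void *Node) {
  uint64_t &Size = PHIOrSelectSizes[Node]; // zero-initialized on first use
  if (!Size)
    Size = computeSize(Node); // fills the cached entry via the reference
  return Size;
}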
- if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize)) - return markAsDead(User); - - uint64_t BeginOffset = Offset.getZExtValue(); - uint64_t EndOffset = BeginOffset + Size; - - // Clamp the end offset to the end of the allocation. Note that this is - // formulated to handle even the case where "BeginOffset + Size" overflows. - assert(AllocSize >= BeginOffset); // Established above. - if (Size > AllocSize - BeginOffset) - EndOffset = AllocSize; - - // NB: This only works if we have zero overlapping partitions. - iterator I = std::lower_bound(P.begin(), P.end(), BeginOffset); - if (I != P.begin() && llvm::prior(I)->EndOffset > BeginOffset) - I = llvm::prior(I); - iterator E = P.end(); - bool IsSplit = llvm::next(I) != E && llvm::next(I)->BeginOffset < EndOffset; - for (; I != E && I->BeginOffset < EndOffset; ++I) { - PartitionUse NewPU(std::max(I->BeginOffset, BeginOffset), - std::min(I->EndOffset, EndOffset), U, IsSplit); - P.use_push_back(I, NewPU); - if (isa<PHINode>(U->getUser()) || isa<SelectInst>(U->getUser())) - P.PHIOrSelectOpMap[U] - = std::make_pair(I - P.begin(), P.Uses[I - P.begin()].size() - 1); - } - } - - void visitBitCastInst(BitCastInst &BC) { - if (BC.use_empty()) - return markAsDead(BC); - - return Base::visitBitCastInst(BC); - } - - void visitGetElementPtrInst(GetElementPtrInst &GEPI) { - if (GEPI.use_empty()) - return markAsDead(GEPI); - - return Base::visitGetElementPtrInst(GEPI); - } - - void visitLoadInst(LoadInst &LI) { - assert(IsOffsetKnown); - uint64_t Size = DL.getTypeStoreSize(LI.getType()); - insertUse(LI, Offset, Size); - } - - void visitStoreInst(StoreInst &SI) { - assert(IsOffsetKnown); - uint64_t Size = DL.getTypeStoreSize(SI.getOperand(0)->getType()); - - // If this memory access can be shown to *statically* extend outside the - // bounds of of the allocation, it's behavior is undefined, so simply - // ignore it. Note that this is more strict than the generic clamping - // behavior of insertUse. - if (Offset.isNegative() || Size > AllocSize || - Offset.ugt(AllocSize - Size)) - return markAsDead(SI); - - insertUse(SI, Offset, Size); - } - - void visitMemSetInst(MemSetInst &II) { - ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); - if ((Length && Length->getValue() == 0) || - (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) - return markAsDead(II); - - assert(IsOffsetKnown); - insertUse(II, Offset, Length ? Length->getLimitedValue() - : AllocSize - Offset.getLimitedValue()); - } - - void visitMemTransferInst(MemTransferInst &II) { - ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); - if ((Length && Length->getValue() == 0) || - (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) - return markAsDead(II); - - assert(IsOffsetKnown); - uint64_t Size = Length ? Length->getLimitedValue() - : AllocSize - Offset.getLimitedValue(); - - const MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; - if (!II.isVolatile() && Offsets.DestEnd && Offsets.SourceEnd && - Offsets.DestBegin == Offsets.SourceBegin) - return markAsDead(II); // Skip identity transfers without side-effects. 
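For contrast, the deleted UseBuilder::insertUse above shows what the old second pass paid for: mapping each use back onto the pre-built disjoint partitions through a binary search, plus one backward step for a predecessor that straddles the query offset. A standalone sketch of that interval lookup over sorted half-open [Begin, End) ranges, assuming zero overlap as the old "NB" comment required:

#include <algorithm>
#include <cstdint>
#include <vector>

struct Partition {
  uint64_t Begin, End; // half-open byte range [Begin, End)
};

static bool startsBefore(const Partition &P, uint64_t Offset) {
  return P.Begin < Offset;
}

// Find the first partition overlapping an access that starts at Begin,
// in a sorted, disjoint partition vector.
std::vector<Partition>::iterator
findFirstOverlap(std::vector<Partition> &Ps, uint64_t Begin) {
  std::vector<Partition>::iterator I =
      std::lower_bound(Ps.begin(), Ps.end(), Begin, startsBefore);
  // lower_bound lands on the first partition with P.Begin >= Begin; the
  // previous partition may still cover Begin, so step back once.
  if (I != Ps.begin() && (I - 1)->End > Begin)
    --I;
  return I;
}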
- - insertUse(II, Offset, Size); - } - - void visitIntrinsicInst(IntrinsicInst &II) { - assert(IsOffsetKnown); - assert(II.getIntrinsicID() == Intrinsic::lifetime_start || - II.getIntrinsicID() == Intrinsic::lifetime_end); - - ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0)); - insertUse(II, Offset, std::min(Length->getLimitedValue(), - AllocSize - Offset.getLimitedValue())); - } - - void insertPHIOrSelect(Instruction &User, const APInt &Offset) { - uint64_t Size = P.PHIOrSelectSizes.lookup(&User).first; - // For PHI and select operands outside the alloca, we can't nuke the entire // phi or select -- the other side might still be relevant, so we special // case them here and use a separate structure to track the operands // themselves which should be replaced with undef. - if ((Offset.isNegative() && Offset.uge(Size)) || + // FIXME: This should instead be escaped in the event we're instrumenting + // for address sanitization. + if ((Offset.isNegative() && (-Offset).uge(PHISize)) || (!Offset.isNegative() && Offset.uge(AllocSize))) { - P.DeadOperands.push_back(U); + S.DeadOperands.push_back(U); return; } - insertUse(User, Offset, Size); - } - - void visitPHINode(PHINode &PN) { - if (PN.use_empty()) - return markAsDead(PN); - - assert(IsOffsetKnown); - insertPHIOrSelect(PN, Offset); + insertUse(PN, Offset, PHISize); } void visitSelectInst(SelectInst &SI) { if (SI.use_empty()) return markAsDead(SI); - if (Value *Result = foldSelectInst(SI)) { if (Result == *U) // If the result of the constant fold will be the pointer, recurse @@ -983,276 +617,106 @@ private: else // Otherwise the operand to the select is dead, and we can replace it // with undef. - P.DeadOperands.push_back(U); + S.DeadOperands.push_back(U); return; } + if (!IsOffsetKnown) + return PI.setAborted(&SI); - assert(IsOffsetKnown); - insertPHIOrSelect(SI, Offset); - } - - /// \brief Unreachable, we've already visited the alloca once. - void visitInstruction(Instruction &I) { - llvm_unreachable("Unhandled instruction in use builder."); - } -}; - -void AllocaPartitioning::splitAndMergePartitions() { - size_t NumDeadPartitions = 0; - - // Track the range of splittable partitions that we pass when accumulating - // overlapping unsplittable partitions. - uint64_t SplitEndOffset = 0ull; - - Partition New(0ull, 0ull, false); - - for (unsigned i = 0, j = i, e = Partitions.size(); i != e; i = j) { - ++j; - - if (!Partitions[i].IsSplittable || New.BeginOffset == New.EndOffset) { - assert(New.BeginOffset == New.EndOffset); - New = Partitions[i]; - } else { - assert(New.IsSplittable); - New.EndOffset = std::max(New.EndOffset, Partitions[i].EndOffset); - } - assert(New.BeginOffset != New.EndOffset); - - // Scan the overlapping partitions. - while (j != e && New.EndOffset > Partitions[j].BeginOffset) { - // If the new partition we are forming is splittable, stop at the first - // unsplittable partition. - if (New.IsSplittable && !Partitions[j].IsSplittable) - break; - - // Grow the new partition to include any equally splittable range. 'j' is - // always equally splittable when New is splittable, but when New is not - // splittable, we may subsume some (or part of some) splitable partition - // without growing the new one. 
- if (New.IsSplittable == Partitions[j].IsSplittable) { - New.EndOffset = std::max(New.EndOffset, Partitions[j].EndOffset); - } else { - assert(!New.IsSplittable); - assert(Partitions[j].IsSplittable); - SplitEndOffset = std::max(SplitEndOffset, Partitions[j].EndOffset); - } - - Partitions[j].kill(); - ++NumDeadPartitions; - ++j; - } - - // If the new partition is splittable, chop off the end as soon as the - // unsplittable subsequent partition starts and ensure we eventually cover - // the splittable area. - if (j != e && New.IsSplittable) { - SplitEndOffset = std::max(SplitEndOffset, New.EndOffset); - New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset); + // See if we already have computed info on this node. + uint64_t &SelectSize = PHIOrSelectSizes[&SI]; + if (!SelectSize) { + // This is a new Select, check for an unsafe use of it. + if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&SI, SelectSize)) + return PI.setAborted(UnsafeI); } - // Add the new partition if it differs from the original one and is - // non-empty. We can end up with an empty partition here if it was - // splittable but there is an unsplittable one that starts at the same - // offset. - if (New != Partitions[i]) { - if (New.BeginOffset != New.EndOffset) - Partitions.push_back(New); - // Mark the old one for removal. - Partitions[i].kill(); - ++NumDeadPartitions; + // For PHI and select operands outside the alloca, we can't nuke the entire + // phi or select -- the other side might still be relevant, so we special + // case them here and use a separate structure to track the operands + // themselves which should be replaced with undef. + // FIXME: This should instead be escaped in the event we're instrumenting + // for address sanitization. + if ((Offset.isNegative() && Offset.uge(SelectSize)) || + (!Offset.isNegative() && Offset.uge(AllocSize))) { + S.DeadOperands.push_back(U); + return; } - New.BeginOffset = New.EndOffset; - if (!New.IsSplittable) { - New.EndOffset = std::max(New.EndOffset, SplitEndOffset); - if (j != e && !Partitions[j].IsSplittable) - New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset); - New.IsSplittable = true; - // If there is a trailing splittable partition which won't be fused into - // the next splittable partition go ahead and add it onto the partitions - // list. - if (New.BeginOffset < New.EndOffset && - (j == e || !Partitions[j].IsSplittable || - New.EndOffset < Partitions[j].BeginOffset)) { - Partitions.push_back(New); - New.BeginOffset = New.EndOffset = 0ull; - } - } + insertUse(SI, Offset, SelectSize); } - // Re-sort the partitions now that they have been split and merged into - // disjoint set of partitions. Also remove any of the dead partitions we've - // replaced in the process. - std::sort(Partitions.begin(), Partitions.end()); - if (NumDeadPartitions) { - assert(Partitions.back().isDead()); - assert((ptrdiff_t)NumDeadPartitions == - std::count(Partitions.begin(), Partitions.end(), Partitions.back())); + /// \brief Disable SROA entirely if there are unhandled users of the alloca. 
+ void visitInstruction(Instruction &I) { + PI.setAborted(&I); } - Partitions.erase(Partitions.end() - NumDeadPartitions, Partitions.end()); -} +}; -AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI) +AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) : #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) AI(AI), #endif PointerEscapingInstr(0) { - PartitionBuilder PB(TD, AI, *this); - PartitionBuilder::PtrInfo PtrI = PB.visitPtr(AI); + SliceBuilder PB(DL, AI, *this); + SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI); if (PtrI.isEscaped() || PtrI.isAborted()) { // FIXME: We should sink the escape vs. abort info into the caller nicely, - // possibly by just storing the PtrInfo in the AllocaPartitioning. + // possibly by just storing the PtrInfo in the AllocaSlices. PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst() : PtrI.getAbortingInst(); assert(PointerEscapingInstr && "Did not track a bad instruction"); return; } + Slices.erase(std::remove_if(Slices.begin(), Slices.end(), + std::mem_fun_ref(&Slice::isDead)), + Slices.end()); + // Sort the uses. This arranges for the offsets to be in ascending order, // and the sizes to be in descending order. - std::sort(Partitions.begin(), Partitions.end()); - - // Remove any partitions from the back which are marked as dead. - while (!Partitions.empty() && Partitions.back().isDead()) - Partitions.pop_back(); - - if (Partitions.size() > 1) { - // Intersect splittability for all partitions with equal offsets and sizes. - // Then remove all but the first so that we have a sequence of non-equal but - // potentially overlapping partitions. - for (iterator I = Partitions.begin(), J = I, E = Partitions.end(); I != E; - I = J) { - ++J; - while (J != E && *I == *J) { - I->IsSplittable &= J->IsSplittable; - ++J; - } - } - Partitions.erase(std::unique(Partitions.begin(), Partitions.end()), - Partitions.end()); - - // Split splittable and merge unsplittable partitions into a disjoint set - // of partitions over the used space of the allocation. - splitAndMergePartitions(); - } - - // Record how many partitions we end up with. - NumAllocaPartitions += Partitions.size(); - MaxPartitionsPerAlloca = std::max<unsigned>(Partitions.size(), MaxPartitionsPerAlloca); - - // Now build up the user lists for each of these disjoint partitions by - // re-walking the recursive users of the alloca. - Uses.resize(Partitions.size()); - UseBuilder UB(TD, AI, *this); - PtrI = UB.visitPtr(AI); - assert(!PtrI.isEscaped() && "Previously analyzed pointer now escapes!"); - assert(!PtrI.isAborted() && "Early aborted the visit of the pointer."); - - unsigned NumUses = 0; -#if !defined(NDEBUG) || defined(LLVM_ENABLE_STATS) - for (unsigned Idx = 0, Size = Uses.size(); Idx != Size; ++Idx) - NumUses += Uses[Idx].size(); -#endif - NumAllocaPartitionUses += NumUses; - MaxPartitionUsesPerAlloca = std::max<unsigned>(NumUses, MaxPartitionUsesPerAlloca); + std::sort(Slices.begin(), Slices.end()); } -Type *AllocaPartitioning::getCommonType(iterator I) const { - Type *Ty = 0; - for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) { - Use *U = UI->getUse(); - if (!U) - continue; // Skip dead uses. 
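The constructor above culls killed slices with the classic erase-remove idiom; std::mem_fun_ref is the C++03 adaptor for calling the isDead member, which a lambda replaces in later C++. A minimal equivalent sketch:

#include <algorithm>
#include <vector>

struct Slice {
  bool Dead;
  bool isDead() const { return Dead; }
};

void pruneDeadSlices(std::vector<Slice> &Slices) {
  // remove_if compacts the live slices to the front and returns the new
  // logical end; erase then drops the dead tail in one shot.
  Slices.erase(std::remove_if(Slices.begin(), Slices.end(),
                              [](const Slice &S) { return S.isDead(); }),
               Slices.end());
}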
- if (isa<IntrinsicInst>(*U->getUser())) - continue; - if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset) - continue; - - Type *UserTy = 0; - if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) - UserTy = LI->getType(); - else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) - UserTy = SI->getValueOperand()->getType(); - else - return 0; // Bail if we have weird uses. - - if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) { - // If the type is larger than the partition, skip it. We only encounter - // this for split integer operations where we want to use the type of the - // entity causing the split. - if (ITy->getBitWidth() > (I->EndOffset - I->BeginOffset)*8) - continue; - - // If we have found an integer type use covering the alloca, use that - // regardless of the other types, as integers are often used for a "bucket - // of bits" type. - return ITy; - } - - if (Ty && Ty != UserTy) - return 0; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - Ty = UserTy; - } - return Ty; +void AllocaSlices::print(raw_ostream &OS, const_iterator I, + StringRef Indent) const { + printSlice(OS, I, Indent); + printUse(OS, I, Indent); } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - -void AllocaPartitioning::print(raw_ostream &OS, const_iterator I, - StringRef Indent) const { - OS << Indent << "partition #" << (I - begin()) - << " [" << I->BeginOffset << "," << I->EndOffset << ")" - << (I->IsSplittable ? " (splittable)" : "") - << (Uses[I - begin()].empty() ? " (zero uses)" : "") - << "\n"; +void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I, + StringRef Indent) const { + OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")" + << " slice #" << (I - begin()) + << (I->isSplittable() ? " (splittable)" : "") << "\n"; } -void AllocaPartitioning::printUsers(raw_ostream &OS, const_iterator I, - StringRef Indent) const { - for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) { - if (!UI->getUse()) - continue; // Skip dead uses. - OS << Indent << " [" << UI->BeginOffset << "," << UI->EndOffset << ") " - << "used by: " << *UI->getUse()->getUser() << "\n"; - if (MemTransferInst *II = - dyn_cast<MemTransferInst>(UI->getUse()->getUser())) { - const MemTransferOffsets &MTO = MemTransferInstData.lookup(II); - bool IsDest; - if (!MTO.IsSplittable) - IsDest = UI->BeginOffset == MTO.DestBegin; - else - IsDest = MTO.DestBegin != 0u; - OS << Indent << " (original " << (IsDest ? "dest" : "source") << ": " - << "[" << (IsDest ? MTO.DestBegin : MTO.SourceBegin) - << "," << (IsDest ? 
MTO.DestEnd : MTO.SourceEnd) << ")\n"; - } - } +void AllocaSlices::printUse(raw_ostream &OS, const_iterator I, + StringRef Indent) const { + OS << Indent << " used by: " << *I->getUse()->getUser() << "\n"; } -void AllocaPartitioning::print(raw_ostream &OS) const { +void AllocaSlices::print(raw_ostream &OS) const { if (PointerEscapingInstr) { - OS << "No partitioning for alloca: " << AI << "\n" + OS << "Can't analyze slices for alloca: " << AI << "\n" << " A pointer to this alloca escaped by:\n" << " " << *PointerEscapingInstr << "\n"; return; } - OS << "Partitioning of alloca: " << AI << "\n"; - for (const_iterator I = begin(), E = end(); I != E; ++I) { + OS << "Slices of alloca: " << AI << "\n"; + for (const_iterator I = begin(), E = end(); I != E; ++I) print(OS, I); - printUsers(OS, I); - } } -void AllocaPartitioning::dump(const_iterator I) const { print(dbgs(), I); } -void AllocaPartitioning::dump() const { print(dbgs()); } +void AllocaSlices::dump(const_iterator I) const { print(dbgs(), I); } +void AllocaSlices::dump() const { print(dbgs()); } #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - namespace { /// \brief Implementation of LoadAndStorePromoter for promoting allocas. /// @@ -1269,12 +733,13 @@ class AllocaPromoter : public LoadAndStorePromoter { SmallVector<DbgValueInst *, 4> DVIs; public: - AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, + AllocaPromoter(const SmallVectorImpl<Instruction *> &Insts, SSAUpdater &S, AllocaInst &AI, DIBuilder &DIB) - : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} + : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} void run(const SmallVectorImpl<Instruction*> &Insts) { - // Remember which alloca we're promoting (for isInstInList). + // Retain the debug information attached to the alloca for use when + // rewriting loads and stores. if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) { for (Value::use_iterator UI = DebugNode->use_begin(), UE = DebugNode->use_end(); @@ -1286,7 +751,9 @@ public: } LoadAndStorePromoter::run(Insts); - AI.eraseFromParent(); + + // While we have the debug information, clear it off of the alloca. The + // caller takes care of deleting the alloca. while (!DDIs.empty()) DDIs.pop_back_val()->eraseFromParent(); while (!DVIs.empty()) @@ -1295,13 +762,34 @@ public: virtual bool isInstInList(Instruction *I, const SmallVectorImpl<Instruction*> &Insts) const { + Value *Ptr; if (LoadInst *LI = dyn_cast<LoadInst>(I)) - return LI->getOperand(0) == &AI; - return cast<StoreInst>(I)->getPointerOperand() == &AI; + Ptr = LI->getOperand(0); + else + Ptr = cast<StoreInst>(I)->getPointerOperand(); + + // Only used to detect cycles, which will be rare and quickly found as + // we're walking up a chain of defs rather than down through uses. 
+ SmallPtrSet<Value *, 4> Visited; + + do { + if (Ptr == &AI) + return true; + + if (BitCastInst *BCI = dyn_cast<BitCastInst>(Ptr)) + Ptr = BCI->getOperand(0); + else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) + Ptr = GEPI->getPointerOperand(); + else + return false; + + } while (Visited.insert(Ptr)); + + return false; } virtual void updateDebugInfo(Instruction *Inst) const { - for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(), + for (SmallVectorImpl<DbgDeclareInst *>::const_iterator I = DDIs.begin(), E = DDIs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) @@ -1309,7 +797,7 @@ public: else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) ConvertDebugDeclareToDebugValue(DDI, LI, DIB); } - for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), + for (SmallVectorImpl<DbgValueInst *>::const_iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; Value *Arg = 0; @@ -1360,7 +848,7 @@ class SROA : public FunctionPass { const bool RequiresDomTree; LLVMContext *C; - const DataLayout *TD; + const DataLayout *DL; DominatorTree *DT; /// \brief Worklist of alloca instructions to simplify. @@ -1390,10 +878,25 @@ class SROA : public FunctionPass { /// \brief A collection of alloca instructions we can directly promote. std::vector<AllocaInst *> PromotableAllocas; + /// \brief A worklist of PHIs to speculate prior to promoting allocas. + /// + /// All of these PHIs have been checked for the safety of speculation and by + /// being speculated will allow promoting allocas currently in the promotable + /// queue. + SetVector<PHINode *, SmallVector<PHINode *, 2> > SpeculatablePHIs; + + /// \brief A worklist of select instructions to speculate prior to promoting + /// allocas. + /// + /// All of these select instructions have been checked for the safety of + /// speculation and by being speculated will allow promoting allocas + /// currently in the promotable queue. + SetVector<SelectInst *, SmallVector<SelectInst *, 2> > SpeculatableSelects; + public: SROA(bool RequiresDomTree = true) : FunctionPass(ID), RequiresDomTree(RequiresDomTree), - C(0), TD(0), DT(0) { + C(0), DL(0), DT(0) { initializeSROAPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F); @@ -1404,13 +907,13 @@ public: private: friend class PHIOrSelectSpeculator; - friend class AllocaPartitionRewriter; - friend class AllocaPartitionVectorRewriter; + friend class AllocaSliceRewriter; - bool rewriteAllocaPartition(AllocaInst &AI, - AllocaPartitioning &P, - AllocaPartitioning::iterator PI); - bool splitAlloca(AllocaInst &AI, AllocaPartitioning &P); + bool rewritePartition(AllocaInst &AI, AllocaSlices &S, + AllocaSlices::iterator B, AllocaSlices::iterator E, + int64_t BeginOffset, int64_t EndOffset, + ArrayRef<AllocaSlices::iterator> SplitUses); + bool splitAlloca(AllocaInst &AI, AllocaSlices &S); bool runOnAlloca(AllocaInst &AI); void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas); bool promoteAllocas(Function &F); @@ -1429,286 +932,255 @@ INITIALIZE_PASS_DEPENDENCY(DominatorTree) INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", false, false) -namespace { -/// \brief Visitor to speculate PHIs and Selects where possible. -class PHIOrSelectSpeculator : public InstVisitor<PHIOrSelectSpeculator> { - // Befriend the base class so it can delegate to private visit methods. 
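The new isInstInList above no longer compares the pointer operand directly against the alloca; it strips bitcasts and GEPs while walking up the def chain, with a small visited set purely as a cycle guard for unreachable code. A standalone sketch of that walk over a toy node type (hypothetical, not LLVM's Value hierarchy):

#include <set>

struct Node {
  enum Kind { Alloca, BitCast, GEP, Other } K;
  Node *Src; // the operand a cast or GEP looks through, if any
};

// Walk up defs from Ptr; true if it is Target or a cast/GEP chain over it.
bool reaches(Node *Ptr, Node *Target) {
  std::set<Node *> Visited; // cycle guard for unreachable-code loops
  do {
    if (Ptr == Target)
      return true;
    if (Ptr->K == Node::BitCast || Ptr->K == Node::GEP)
      Ptr = Ptr->Src;
    else
      return false;
  } while (Visited.insert(Ptr).second);
  return false; // hit a cycle
}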
- friend class llvm::InstVisitor<PHIOrSelectSpeculator>; - - const DataLayout &TD; - AllocaPartitioning &P; - SROA &Pass; +/// Walk the range of a partitioning looking for a common type to cover this +/// sequence of slices. +static Type *findCommonType(AllocaSlices::const_iterator B, + AllocaSlices::const_iterator E, + uint64_t EndOffset) { + Type *Ty = 0; + bool IgnoreNonIntegralTypes = false; + for (AllocaSlices::const_iterator I = B; I != E; ++I) { + Use *U = I->getUse(); + if (isa<IntrinsicInst>(*U->getUser())) + continue; + if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset) + continue; -public: - PHIOrSelectSpeculator(const DataLayout &TD, AllocaPartitioning &P, SROA &Pass) - : TD(TD), P(P), Pass(Pass) {} - - /// \brief Visit the users of an alloca partition and rewrite them. - void visitUsers(AllocaPartitioning::const_iterator PI) { - // Note that we need to use an index here as the underlying vector of uses - // may be grown during speculation. However, we never need to re-visit the - // new uses, and so we can use the initial size bound. - for (unsigned Idx = 0, Size = P.use_size(PI); Idx != Size; ++Idx) { - const PartitionUse &PU = P.getUse(PI, Idx); - if (!PU.getUse()) - continue; // Skip dead use. - - visit(cast<Instruction>(PU.getUse()->getUser())); + Type *UserTy = 0; + if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { + UserTy = LI->getType(); + } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { + UserTy = SI->getValueOperand()->getType(); + } else { + IgnoreNonIntegralTypes = true; // Give up on anything but an iN type. + continue; } - } -private: - // By default, skip this instruction. - void visitInstruction(Instruction &I) {} - - /// PHI instructions that use an alloca and are subsequently loaded can be - /// rewritten to load both input pointers in the pred blocks and then PHI the - /// results, allowing the load of the alloca to be promoted. - /// From this: - /// %P2 = phi [i32* %Alloca, i32* %Other] - /// %V = load i32* %P2 - /// to: - /// %V1 = load i32* %Alloca -> will be mem2reg'd - /// ... - /// %V2 = load i32* %Other - /// ... - /// %V = phi [i32 %V1, i32 %V2] - /// - /// We can do this to a select if its only uses are loads and if the operands - /// to the select can be loaded unconditionally. - /// - /// FIXME: This should be hoisted into a generic utility, likely in - /// Transforms/Util/Local.h - bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl<LoadInst *> &Loads) { - // For now, we can only do this promotion if the load is in the same block - // as the PHI, and if there are no stores between the phi and load. - // TODO: Allow recursive phi users. - // TODO: Allow stores. - BasicBlock *BB = PN.getParent(); - unsigned MaxAlign = 0; - for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end(); - UI != UE; ++UI) { - LoadInst *LI = dyn_cast<LoadInst>(*UI); - if (LI == 0 || !LI->isSimple()) return false; - - // For now we only allow loads in the same block as the PHI. This is - // a common case that happens when instcombine merges two loads through - // a PHI. - if (LI->getParent() != BB) return false; - - // Ensure that there are no instructions between the PHI and the load that - // could store. - for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI) - if (BBI->mayWriteToMemory()) - return false; - - MaxAlign = std::max(MaxAlign, LI->getAlignment()); - Loads.push_back(LI); + if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) { + // If the type is larger than the partition, skip it. 
We only encounter + // this for split integer operations where we want to use the type of the + // entity causing the split. Also skip if the type is not a byte width + // multiple. + if (ITy->getBitWidth() % 8 != 0 || + ITy->getBitWidth() / 8 > (EndOffset - B->beginOffset())) + continue; + + // If we have found an integer type use covering the alloca, use that + // regardless of the other types, as integers are often used for + // a "bucket of bits" type. + // + // NB: This *must* be the only return from inside the loop so that the + // order of slices doesn't impact the computed type. + return ITy; + } else if (IgnoreNonIntegralTypes) { + continue; } - // We can only transform this if it is safe to push the loads into the - // predecessor blocks. The only thing to watch out for is that we can't put - // a possibly trapping load in the predecessor if it is a critical edge. - for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { - TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator(); - Value *InVal = PN.getIncomingValue(Idx); - - // If the value is produced by the terminator of the predecessor (an - // invoke) or it has side-effects, there is no valid place to put a load - // in the predecessor. - if (TI == InVal || TI->mayHaveSideEffects()) - return false; + if (Ty && Ty != UserTy) + IgnoreNonIntegralTypes = true; // Give up on anything but an iN type. - // If the predecessor has a single successor, then the edge isn't - // critical. - if (TI->getNumSuccessors() == 1) - continue; + Ty = UserTy; + } + return Ty; +} - // If this pointer is always safe to load, or if we can prove that there - // is already a load in the block, then we can move the load to the pred - // block. - if (InVal->isDereferenceablePointer() || - isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD)) - continue; +/// PHI instructions that use an alloca and are subsequently loaded can be +/// rewritten to load both input pointers in the pred blocks and then PHI the +/// results, allowing the load of the alloca to be promoted. +/// From this: +/// %P2 = phi [i32* %Alloca, i32* %Other] +/// %V = load i32* %P2 +/// to: +/// %V1 = load i32* %Alloca -> will be mem2reg'd +/// ... +/// %V2 = load i32* %Other +/// ... +/// %V = phi [i32 %V1, i32 %V2] +/// +/// We can do this to a select if its only uses are loads and if the operands +/// to the select can be loaded unconditionally. +/// +/// FIXME: This should be hoisted into a generic utility, likely in +/// Transforms/Util/Local.h +static bool isSafePHIToSpeculate(PHINode &PN, + const DataLayout *DL = 0) { + // For now, we can only do this promotion if the load is in the same block + // as the PHI, and if there are no stores between the phi and load. + // TODO: Allow recursive phi users. + // TODO: Allow stores. + BasicBlock *BB = PN.getParent(); + unsigned MaxAlign = 0; + bool HaveLoad = false; + for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end(); UI != UE; + ++UI) { + LoadInst *LI = dyn_cast<LoadInst>(*UI); + if (LI == 0 || !LI->isSimple()) + return false; + // For now we only allow loads in the same block as the PHI. This is + // a common case that happens when instcombine merges two loads through + // a PHI. + if (LI->getParent() != BB) return false; - } - return true; + // Ensure that there are no instructions between the PHI and the load that + // could store. 
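In source terms, the transformation that isSafePHIToSpeculate guards (and that speculatePHINodeLoads performs below) looks roughly like this hypothetical C++ pair: the load sinks into each predecessor, so each hoisted load only has to be safe on its own incoming edge:

// Before: the load happens after the pointers merge (a phi in IR).
int loadOfPhi(bool c, int *a, int *other) {
  int *p;
  if (c) p = a; else p = other;
  return *p;
}

// After speculation: each predecessor loads its own pointer and only
// the loaded values merge, so the alloca-side load can be promoted.
int speculatedPhi(bool c, int *a, int *other) {
  int v;
  if (c) v = *a;      // mem2reg'd when 'a' is the alloca
  else   v = *other;
  return v;
}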
+ for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI) + if (BBI->mayWriteToMemory()) + return false; + + MaxAlign = std::max(MaxAlign, LI->getAlignment()); + HaveLoad = true; } - void visitPHINode(PHINode &PN) { - DEBUG(dbgs() << " original: " << PN << "\n"); + if (!HaveLoad) + return false; - SmallVector<LoadInst *, 4> Loads; - if (!isSafePHIToSpeculate(PN, Loads)) - return; + // We can only transform this if it is safe to push the loads into the + // predecessor blocks. The only thing to watch out for is that we can't put + // a possibly trapping load in the predecessor if it is a critical edge. + for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { + TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator(); + Value *InVal = PN.getIncomingValue(Idx); + + // If the value is produced by the terminator of the predecessor (an + // invoke) or it has side-effects, there is no valid place to put a load + // in the predecessor. + if (TI == InVal || TI->mayHaveSideEffects()) + return false; - assert(!Loads.empty()); + // If the predecessor has a single successor, then the edge isn't + // critical. + if (TI->getNumSuccessors() == 1) + continue; - Type *LoadTy = cast<PointerType>(PN.getType())->getElementType(); - IRBuilderTy PHIBuilder(&PN); - PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), - PN.getName() + ".sroa.speculated"); + // If this pointer is always safe to load, or if we can prove that there + // is already a load in the block, then we can move the load to the pred + // block. + if (InVal->isDereferenceablePointer() || + isSafeToLoadUnconditionally(InVal, TI, MaxAlign, DL)) + continue; - // Get the TBAA tag and alignment to use from one of the loads. It doesn't - // matter which one we get and if any differ. - LoadInst *SomeLoad = cast<LoadInst>(Loads.back()); - MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); - unsigned Align = SomeLoad->getAlignment(); + return false; + } - // Rewrite all loads of the PN to use the new PHI. - do { - LoadInst *LI = Loads.pop_back_val(); - LI->replaceAllUsesWith(NewPN); - Pass.DeadInsts.insert(LI); - } while (!Loads.empty()); - - // Inject loads into all of the pred blocks. - for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { - BasicBlock *Pred = PN.getIncomingBlock(Idx); - TerminatorInst *TI = Pred->getTerminator(); - Use *InUse = &PN.getOperandUse(PN.getOperandNumForIncomingValue(Idx)); - Value *InVal = PN.getIncomingValue(Idx); - IRBuilderTy PredBuilder(TI); - - LoadInst *Load - = PredBuilder.CreateLoad(InVal, (PN.getName() + ".sroa.speculate.load." + - Pred->getName())); - ++NumLoadsSpeculated; - Load->setAlignment(Align); - if (TBAATag) - Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); - NewPN->addIncoming(Load, Pred); - - Instruction *Ptr = dyn_cast<Instruction>(InVal); - if (!Ptr) - // No uses to rewrite. - continue; + return true; +} - // Try to lookup and rewrite any partition uses corresponding to this phi - // input. - AllocaPartitioning::iterator PI - = P.findPartitionForPHIOrSelectOperand(InUse); - if (PI == P.end()) - continue; +static void speculatePHINodeLoads(PHINode &PN) { + DEBUG(dbgs() << " original: " << PN << "\n"); + + Type *LoadTy = cast<PointerType>(PN.getType())->getElementType(); + IRBuilderTy PHIBuilder(&PN); + PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), + PN.getName() + ".sroa.speculated"); + + // Get the TBAA tag and alignment to use from one of the loads. 
It doesn't + // matter which one we get and if any differ. + LoadInst *SomeLoad = cast<LoadInst>(*PN.use_begin()); + MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); + unsigned Align = SomeLoad->getAlignment(); + + // Rewrite all loads of the PN to use the new PHI. + while (!PN.use_empty()) { + LoadInst *LI = cast<LoadInst>(*PN.use_begin()); + LI->replaceAllUsesWith(NewPN); + LI->eraseFromParent(); + } + + // Inject loads into all of the pred blocks. + for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { + BasicBlock *Pred = PN.getIncomingBlock(Idx); + TerminatorInst *TI = Pred->getTerminator(); + Value *InVal = PN.getIncomingValue(Idx); + IRBuilderTy PredBuilder(TI); + + LoadInst *Load = PredBuilder.CreateLoad( + InVal, (PN.getName() + ".sroa.speculate.load." + Pred->getName())); + ++NumLoadsSpeculated; + Load->setAlignment(Align); + if (TBAATag) + Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); + NewPN->addIncoming(Load, Pred); + } + + DEBUG(dbgs() << " speculated to: " << *NewPN << "\n"); + PN.eraseFromParent(); +} - // Replace the Use in the PartitionUse for this operand with the Use - // inside the load. - AllocaPartitioning::use_iterator UI - = P.findPartitionUseForPHIOrSelectOperand(InUse); - assert(isa<PHINode>(*UI->getUse()->getUser())); - UI->setUse(&Load->getOperandUse(Load->getPointerOperandIndex())); - } - DEBUG(dbgs() << " speculated to: " << *NewPN << "\n"); - } - - /// Select instructions that use an alloca and are subsequently loaded can be - /// rewritten to load both input pointers and then select between the result, - /// allowing the load of the alloca to be promoted. - /// From this: - /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other - /// %V = load i32* %P2 - /// to: - /// %V1 = load i32* %Alloca -> will be mem2reg'd - /// %V2 = load i32* %Other - /// %V = select i1 %cond, i32 %V1, i32 %V2 - /// - /// We can do this to a select if its only uses are loads and if the operand - /// to the select can be loaded unconditionally. - bool isSafeSelectToSpeculate(SelectInst &SI, - SmallVectorImpl<LoadInst *> &Loads) { - Value *TValue = SI.getTrueValue(); - Value *FValue = SI.getFalseValue(); - bool TDerefable = TValue->isDereferenceablePointer(); - bool FDerefable = FValue->isDereferenceablePointer(); - - for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end(); - UI != UE; ++UI) { - LoadInst *LI = dyn_cast<LoadInst>(*UI); - if (LI == 0 || !LI->isSimple()) return false; - - // Both operands to the select need to be dereferencable, either - // absolutely (e.g. allocas) or at this point because we can see other - // accesses to it. - if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI, - LI->getAlignment(), &TD)) - return false; - if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI, - LI->getAlignment(), &TD)) - return false; - Loads.push_back(LI); - } +/// Select instructions that use an alloca and are subsequently loaded can be +/// rewritten to load both input pointers and then select between the result, +/// allowing the load of the alloca to be promoted. +/// From this: +/// %P2 = select i1 %cond, i32* %Alloca, i32* %Other +/// %V = load i32* %P2 +/// to: +/// %V1 = load i32* %Alloca -> will be mem2reg'd +/// %V2 = load i32* %Other +/// %V = select i1 %cond, i32 %V1, i32 %V2 +/// +/// We can do this to a select if its only uses are loads and if the operand +/// to the select can be loaded unconditionally. 
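The select case stated above differs from the PHI case in one important way: both loads are emitted unconditionally ahead of the select, so both pointers must be dereferenceable in their own right, not merely safe on one edge. The same hypothetical pair in C++ terms:

// Before: a single load through whichever pointer the select picks.
int loadOfSelect(bool c, int *a, int *other) {
  return *(c ? a : other);
}

// After speculation: both loads execute, then the values are selected.
// Only valid because both pointers are known dereferenceable.
int speculatedSelect(bool c, int *a, int *other) {
  int va = *a;          // mem2reg'd when 'a' is the alloca
  int vother = *other;  // executes even when c is true
  return c ? va : vother;
}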
+static bool isSafeSelectToSpeculate(SelectInst &SI, const DataLayout *DL = 0) { + Value *TValue = SI.getTrueValue(); + Value *FValue = SI.getFalseValue(); + bool TDerefable = TValue->isDereferenceablePointer(); + bool FDerefable = FValue->isDereferenceablePointer(); + + for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end(); UI != UE; + ++UI) { + LoadInst *LI = dyn_cast<LoadInst>(*UI); + if (LI == 0 || !LI->isSimple()) + return false; - return true; + // Both operands to the select need to be dereferencable, either + // absolutely (e.g. allocas) or at this point because we can see other + // accesses to it. + if (!TDerefable && + !isSafeToLoadUnconditionally(TValue, LI, LI->getAlignment(), DL)) + return false; + if (!FDerefable && + !isSafeToLoadUnconditionally(FValue, LI, LI->getAlignment(), DL)) + return false; } - void visitSelectInst(SelectInst &SI) { - DEBUG(dbgs() << " original: " << SI << "\n"); - - // If the select isn't safe to speculate, just use simple logic to emit it. - SmallVector<LoadInst *, 4> Loads; - if (!isSafeSelectToSpeculate(SI, Loads)) - return; + return true; +} - IRBuilderTy IRB(&SI); - Use *Ops[2] = { &SI.getOperandUse(1), &SI.getOperandUse(2) }; - AllocaPartitioning::iterator PIs[2]; - PartitionUse PUs[2]; - for (unsigned i = 0, e = 2; i != e; ++i) { - PIs[i] = P.findPartitionForPHIOrSelectOperand(Ops[i]); - if (PIs[i] != P.end()) { - // If the pointer is within the partitioning, remove the select from - // its uses. We'll add in the new loads below. - AllocaPartitioning::use_iterator UI - = P.findPartitionUseForPHIOrSelectOperand(Ops[i]); - PUs[i] = *UI; - // Clear out the use here so that the offsets into the use list remain - // stable but this use is ignored when rewriting. - UI->setUse(0); - } - } +static void speculateSelectInstLoads(SelectInst &SI) { + DEBUG(dbgs() << " original: " << SI << "\n"); - Value *TV = SI.getTrueValue(); - Value *FV = SI.getFalseValue(); - // Replace the loads of the select with a select of two loads. - while (!Loads.empty()) { - LoadInst *LI = Loads.pop_back_val(); + IRBuilderTy IRB(&SI); + Value *TV = SI.getTrueValue(); + Value *FV = SI.getFalseValue(); + // Replace the loads of the select with a select of two loads. + while (!SI.use_empty()) { + LoadInst *LI = cast<LoadInst>(*SI.use_begin()); + assert(LI->isSimple() && "We only speculate simple loads"); - IRB.SetInsertPoint(LI); - LoadInst *TL = + IRB.SetInsertPoint(LI); + LoadInst *TL = IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true"); - LoadInst *FL = + LoadInst *FL = IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false"); - NumLoadsSpeculated += 2; - - // Transfer alignment and TBAA info if present. - TL->setAlignment(LI->getAlignment()); - FL->setAlignment(LI->getAlignment()); - if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) { - TL->setMetadata(LLVMContext::MD_tbaa, Tag); - FL->setMetadata(LLVMContext::MD_tbaa, Tag); - } + NumLoadsSpeculated += 2; - Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL, - LI->getName() + ".sroa.speculated"); + // Transfer alignment and TBAA info if present. 
+ TL->setAlignment(LI->getAlignment()); + FL->setAlignment(LI->getAlignment()); + if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) { + TL->setMetadata(LLVMContext::MD_tbaa, Tag); + FL->setMetadata(LLVMContext::MD_tbaa, Tag); + } - LoadInst *Loads[2] = { TL, FL }; - for (unsigned i = 0, e = 2; i != e; ++i) { - if (PIs[i] != P.end()) { - Use *LoadUse = &Loads[i]->getOperandUse(0); - assert(PUs[i].getUse()->get() == LoadUse->get()); - PUs[i].setUse(LoadUse); - P.use_push_back(PIs[i], PUs[i]); - } - } + Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL, + LI->getName() + ".sroa.speculated"); - DEBUG(dbgs() << " speculated to: " << *V << "\n"); - LI->replaceAllUsesWith(V); - Pass.DeadInsts.insert(LI); - } + DEBUG(dbgs() << " speculated to: " << *V << "\n"); + LI->replaceAllUsesWith(V); + LI->eraseFromParent(); } -}; + SI.eraseFromParent(); } /// \brief Build a GEP out of a base pointer and indices. @@ -1737,7 +1209,7 @@ static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr, /// TargetTy. If we can't find one with the same type, we at least try to use /// one with the same size. If none of that works, we just produce the GEP as /// indicated by Indices to have the correct offset. -static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &TD, +static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL, Value *BasePtr, Type *Ty, Type *TargetTy, SmallVectorImpl<Value *> &Indices) { if (Ty == TargetTy) @@ -1754,7 +1226,7 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &TD, ElementTy = SeqTy->getElementType(); // Note that we use the default address space as this index is over an // array or a vector, not a pointer. - Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(0), 0))); + Indices.push_back(IRB.getInt(APInt(DL.getPointerSizeInBits(0), 0))); } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) { if (STy->element_begin() == STy->element_end()) break; // Nothing left to descend into. @@ -1775,12 +1247,12 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &TD, /// /// This is the recursive step for getNaturalGEPWithOffset that walks down the /// element types adding appropriate indices for the GEP. -static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD, +static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, Type *Ty, APInt &Offset, Type *TargetTy, SmallVectorImpl<Value *> &Indices) { if (Offset == 0) - return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices); + return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices); // We can't recurse through pointer types. if (Ty->isPointerTy()) @@ -1790,7 +1262,7 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD, // extremely poorly defined currently. The long-term goal is to remove GEPing // over a vector from the IR completely. if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) { - unsigned ElementSizeInBits = TD.getTypeSizeInBits(VecTy->getScalarType()); + unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType()); if (ElementSizeInBits % 8) return 0; // GEPs over non-multiple of 8 size vector elements are invalid. 
APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8); @@ -1799,20 +1271,20 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD, return 0; Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); - return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(), + return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(), Offset, TargetTy, Indices); } if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { Type *ElementTy = ArrTy->getElementType(); - APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); + APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy)); APInt NumSkippedElements = Offset.sdiv(ElementSize); if (NumSkippedElements.ugt(ArrTy->getNumElements())) return 0; Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); - return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, + return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, Indices); } @@ -1820,18 +1292,18 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD, if (!STy) return 0; - const StructLayout *SL = TD.getStructLayout(STy); + const StructLayout *SL = DL.getStructLayout(STy); uint64_t StructOffset = Offset.getZExtValue(); if (StructOffset >= SL->getSizeInBytes()) return 0; unsigned Index = SL->getElementContainingOffset(StructOffset); Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index)); Type *ElementTy = STy->getElementType(Index); - if (Offset.uge(TD.getTypeAllocSize(ElementTy))) + if (Offset.uge(DL.getTypeAllocSize(ElementTy))) return 0; // The offset points into alignment padding. Indices.push_back(IRB.getInt32(Index)); - return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, + return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, Indices); } @@ -1845,7 +1317,7 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD, /// Indices, and setting Ty to the result subtype. /// /// If no natural GEP can be constructed, this function returns null. -static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &TD, +static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, APInt Offset, Type *TargetTy, SmallVectorImpl<Value *> &Indices) { PointerType *Ty = cast<PointerType>(Ptr->getType()); @@ -1858,14 +1330,14 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &TD, Type *ElementTy = Ty->getElementType(); if (!ElementTy->isSized()) return 0; // We can't GEP through an unsized element. - APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); + APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy)); if (ElementSize == 0) return 0; // Zero-length arrays can't help us build a natural GEP. APInt NumSkippedElements = Offset.sdiv(ElementSize); Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); - return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, + return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, Indices); } @@ -1884,7 +1356,7 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &TD, /// properties. The algorithm tries to fold as many constant indices into /// a single GEP as possible, thus making each GEP more independent of the /// surrounding code. 
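The natural-GEP helpers above and the getAdjustedPtr entry point below all reduce a flat byte offset to a chain of indices one aggregate level at a time: divide by the element size for arrays and vectors, consult the struct layout for structs, and give up (falling back to an i8 GEP) whenever a remainder points mid-element. A standalone sketch of the array/vector step, with caller-supplied element sizes standing in for DataLayout:

#include <cstdint>
#include <vector>

// ElemSizes[i] is the byte size of one element at nesting level i,
// e.g. {8, 4} for [4 x [2 x i32]]. Returns the index chain, or an
// empty vector when no natural GEP reaches the offset exactly.
std::vector<uint64_t> naturalIndices(uint64_t Offset,
                                     const std::vector<uint64_t> &ElemSizes) {
  std::vector<uint64_t> Indices;
  for (size_t i = 0; i < ElemSizes.size(); ++i) {
    Indices.push_back(Offset / ElemSizes[i]); // skip whole elements
    Offset %= ElemSizes[i];                   // descend with the remainder
  }
  if (Offset != 0)
    Indices.clear(); // offset lands mid-element or in padding
  return Indices;
}

For example, offset 20 with sizes {8, 4} yields {2, 1}, a GEP through element 2 and then sub-element 1; offset 22 leaves a remainder of 2 and fails, mirroring the "points into alignment padding" bail-out above.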
-static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &TD, +static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, APInt Offset, Type *PointerTy) { // Even though we don't look through PHI nodes, we could be called on an // instruction in an unreachable block, which may be on a cycle. @@ -1908,7 +1380,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &TD, // First fold any existing GEPs into the offset. while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) { APInt GEPOffset(Offset.getBitWidth(), 0); - if (!GEP->accumulateConstantOffset(TD, GEPOffset)) + if (!GEP->accumulateConstantOffset(DL, GEPOffset)) break; Offset += GEPOffset; Ptr = GEP->getPointerOperand(); @@ -1918,7 +1390,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &TD, // See if we can perform a natural GEP here. Indices.clear(); - if (Value *P = getNaturalGEPWithOffset(IRB, TD, Ptr, Offset, TargetTy, + if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy, Indices)) { if (P->getType() == PointerTy) { // Zap any offset pointer that we ended up computing in previous rounds. @@ -1989,6 +1461,10 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) return false; + // We can convert pointers to integers and vice-versa. Same for vectors + // of pointers and integers. + OldTy = OldTy->getScalarType(); + NewTy = NewTy->getScalarType(); if (NewTy->isPointerTy() || OldTy->isPointerTy()) { if (NewTy->isPointerTy() && OldTy->isPointerTy()) return true; @@ -2007,24 +1483,126 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { /// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test /// two types for viability with this routine. static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, - Type *Ty) { - assert(canConvertValue(DL, V->getType(), Ty) && - "Value not convertable to type"); - if (V->getType() == Ty) + Type *NewTy) { + Type *OldTy = V->getType(); + assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type"); + + if (OldTy == NewTy) return V; - if (IntegerType *OldITy = dyn_cast<IntegerType>(V->getType())) - if (IntegerType *NewITy = dyn_cast<IntegerType>(Ty)) + + if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy)) + if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy)) if (NewITy->getBitWidth() > OldITy->getBitWidth()) return IRB.CreateZExt(V, NewITy); - if (V->getType()->isIntegerTy() && Ty->isPointerTy()) - return IRB.CreateIntToPtr(V, Ty); - if (V->getType()->isPointerTy() && Ty->isIntegerTy()) - return IRB.CreatePtrToInt(V, Ty); - return IRB.CreateBitCast(V, Ty); + // See if we need inttoptr for this type pair. A cast involving both scalars + // and vectors requires an additional bitcast. + if (OldTy->getScalarType()->isIntegerTy() && + NewTy->getScalarType()->isPointerTy()) { + // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8* + if (OldTy->isVectorTy() && !NewTy->isVectorTy()) + return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)), + NewTy); + + // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*> + if (!OldTy->isVectorTy() && NewTy->isVectorTy()) + return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)), + NewTy); + + return IRB.CreateIntToPtr(V, NewTy); + } + + // See if we need ptrtoint for this type pair. A cast involving both scalars + // and vectors requires an additional bitcast.
+ if (OldTy->getScalarType()->isPointerTy() && + NewTy->getScalarType()->isIntegerTy()) { + // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128 + if (OldTy->isVectorTy() && !NewTy->isVectorTy()) + return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), + NewTy); + + // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32> + if (!OldTy->isVectorTy() && NewTy->isVectorTy()) + return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), + NewTy); + + return IRB.CreatePtrToInt(V, NewTy); + } + + return IRB.CreateBitCast(V, NewTy); } -/// \brief Test whether the given alloca partition can be promoted to a vector. +/// \brief Test whether the given slice use can be promoted to a vector. +/// +/// This function is called to test each entry in a partitioning which is slated +/// for a single slice. +static bool isVectorPromotionViableForSlice( + const DataLayout &DL, AllocaSlices &S, uint64_t SliceBeginOffset, + uint64_t SliceEndOffset, VectorType *Ty, uint64_t ElementSize, + AllocaSlices::const_iterator I) { + // First validate the slice offsets. + uint64_t BeginOffset = + std::max(I->beginOffset(), SliceBeginOffset) - SliceBeginOffset; + uint64_t BeginIndex = BeginOffset / ElementSize; + if (BeginIndex * ElementSize != BeginOffset || + BeginIndex >= Ty->getNumElements()) + return false; + uint64_t EndOffset = + std::min(I->endOffset(), SliceEndOffset) - SliceBeginOffset; + uint64_t EndIndex = EndOffset / ElementSize; + if (EndIndex * ElementSize != EndOffset || EndIndex > Ty->getNumElements()) + return false; + + assert(EndIndex > BeginIndex && "Empty vector!"); + uint64_t NumElements = EndIndex - BeginIndex; + Type *SliceTy = + (NumElements == 1) ? Ty->getElementType() + : VectorType::get(Ty->getElementType(), NumElements); + + Type *SplitIntTy = + Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8); + + Use *U = I->getUse(); + + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { + if (MI->isVolatile()) + return false; + if (!I->isSplittable()) + return false; // Skip any unsplittable intrinsics. + } else if (U->get()->getType()->getPointerElementType()->isStructTy()) { + // Disable vector promotion when there are loads or stores of an FCA. + return false; + } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { + if (LI->isVolatile()) + return false; + Type *LTy = LI->getType(); + if (SliceBeginOffset > I->beginOffset() || + SliceEndOffset < I->endOffset()) { + assert(LTy->isIntegerTy()); + LTy = SplitIntTy; + } + if (!canConvertValue(DL, SliceTy, LTy)) + return false; + } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { + if (SI->isVolatile()) + return false; + Type *STy = SI->getValueOperand()->getType(); + if (SliceBeginOffset > I->beginOffset() || + SliceEndOffset < I->endOffset()) { + assert(STy->isIntegerTy()); + STy = SplitIntTy; + } + if (!canConvertValue(DL, STy, SliceTy)) + return false; + } else { + return false; + } + + return true; +} + +/// \brief Test whether the given alloca partitioning and range of slices can be +/// promoted to a vector. /// /// This is a quick test to check whether we can rewrite a particular alloca /// partition (and its newly formed alloca) into a vector alloca with only @@ -2032,75 +1610,103 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, /// SSA value. We only can ensure this for a limited set of operations, and we /// don't want to do the rewrites unless we are confident that the result will /// be promotable, so we have an early test here.
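Note how convertValue above never casts directly between mixed pointer/integer shapes: a scalar/vector mismatch is first bridged through the integer type of pointer width (e.g. <2 x i32> -> i64 -> i8*). A toy model of that decision, using an assumed four-way type classification instead of LLVM's type system:

#include <string>

enum Kind { IntScalar, PtrScalar, IntVector, PtrVector };

// Names the cast sequence convertValue would emit for OldTy -> NewTy. Mixed
// scalar/vector int<->ptr pairs need a bitcast through a pointer-width
// integer around the inttoptr/ptrtoint.
std::string castSequence(Kind Old, Kind New) {
  bool OldInt = (Old == IntScalar || Old == IntVector);
  bool NewPtr = (New == PtrScalar || New == PtrVector);
  bool OldVec = (Old == IntVector || Old == PtrVector);
  bool NewVec = (New == IntVector || New == PtrVector);
  if (OldInt && NewPtr) // integer-ish -> pointer-ish
    return OldVec != NewVec ? "bitcast + inttoptr" : "inttoptr";
  if (!OldInt && !NewPtr) // pointer-ish -> integer-ish
    return OldVec != NewVec ? "ptrtoint + bitcast" : "ptrtoint";
  return "bitcast"; // same category (or zext for widening integers)
}

The aggregate vector test itself follows.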
-static bool isVectorPromotionViable(const DataLayout &TD, - Type *AllocaTy, - AllocaPartitioning &P, - uint64_t PartitionBeginOffset, - uint64_t PartitionEndOffset, - AllocaPartitioning::const_use_iterator I, - AllocaPartitioning::const_use_iterator E) { +static bool +isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy, AllocaSlices &S, + uint64_t SliceBeginOffset, uint64_t SliceEndOffset, + AllocaSlices::const_iterator I, + AllocaSlices::const_iterator E, + ArrayRef<AllocaSlices::iterator> SplitUses) { VectorType *Ty = dyn_cast<VectorType>(AllocaTy); if (!Ty) return false; - uint64_t ElementSize = TD.getTypeSizeInBits(Ty->getScalarType()); + uint64_t ElementSize = DL.getTypeSizeInBits(Ty->getScalarType()); // While the definition of LLVM vectors is bitpacked, we don't support sizes // that aren't byte sized. if (ElementSize % 8) return false; - assert((TD.getTypeSizeInBits(Ty) % 8) == 0 && + assert((DL.getTypeSizeInBits(Ty) % 8) == 0 && "vector size not a multiple of element size?"); ElementSize /= 8; - for (; I != E; ++I) { - Use *U = I->getUse(); - if (!U) - continue; // Skip dead use. - - uint64_t BeginOffset = I->BeginOffset - PartitionBeginOffset; - uint64_t BeginIndex = BeginOffset / ElementSize; - if (BeginIndex * ElementSize != BeginOffset || - BeginIndex >= Ty->getNumElements()) + for (; I != E; ++I) + if (!isVectorPromotionViableForSlice(DL, S, SliceBeginOffset, + SliceEndOffset, Ty, ElementSize, I)) return false; - uint64_t EndOffset = I->EndOffset - PartitionBeginOffset; - uint64_t EndIndex = EndOffset / ElementSize; - if (EndIndex * ElementSize != EndOffset || - EndIndex > Ty->getNumElements()) + + for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(), + SUE = SplitUses.end(); + SUI != SUE; ++SUI) + if (!isVectorPromotionViableForSlice(DL, S, SliceBeginOffset, + SliceEndOffset, Ty, ElementSize, *SUI)) return false; - assert(EndIndex > BeginIndex && "Empty vector!"); - uint64_t NumElements = EndIndex - BeginIndex; - Type *PartitionTy - = (NumElements == 1) ? Ty->getElementType() - : VectorType::get(Ty->getElementType(), NumElements); + return true; +} - if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { - if (MI->isVolatile()) - return false; - if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U->getUser())) { - const AllocaPartitioning::MemTransferOffsets &MTO - = P.getMemTransferOffsets(*MTI); - if (!MTO.IsSplittable) - return false; - } - } else if (U->get()->getType()->getPointerElementType()->isStructTy()) { - // Disable vector promotion when there are loads or stores of an FCA. +/// \brief Test whether a slice of an alloca is valid for integer widening. +/// +/// This implements the necessary checking for the \c isIntegerWideningViable +/// test below on a single slice of the alloca. +static bool isIntegerWideningViableForSlice(const DataLayout &DL, + Type *AllocaTy, + uint64_t AllocBeginOffset, + uint64_t Size, AllocaSlices &S, + AllocaSlices::const_iterator I, + bool &WholeAllocaOp) { + uint64_t RelBegin = I->beginOffset() - AllocBeginOffset; + uint64_t RelEnd = I->endOffset() - AllocBeginOffset; + + // We can't reasonably handle cases where the load or store extends past + // the end of the alloca's type and into its padding.
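isVectorPromotionViableForSlice, above, only admits accesses that land exactly on element boundaries of the candidate vector type. The index arithmetic, as a standalone sketch with assumed sizes:

#include <cstdint>

// True when [BeginOffset, EndOffset) covers a whole, non-empty run of
// ElementSize-byte vector elements, mirroring the BeginIndex/EndIndex checks.
bool coversWholeElements(uint64_t BeginOffset, uint64_t EndOffset,
                         uint64_t ElementSize, uint64_t NumElements) {
  uint64_t BeginIndex = BeginOffset / ElementSize;
  if (BeginIndex * ElementSize != BeginOffset || BeginIndex >= NumElements)
    return false;
  uint64_t EndIndex = EndOffset / ElementSize;
  if (EndIndex * ElementSize != EndOffset || EndIndex > NumElements)
    return false;
  return EndIndex > BeginIndex; // non-empty
}

The padding check the comment above describes is the RelEnd > Size test that follows.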
+ if (RelEnd > Size) + return false; + + Use *U = I->getUse(); + + if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { + if (LI->isVolatile()) return false; - } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { - if (LI->isVolatile()) - return false; - if (!canConvertValue(TD, PartitionTy, LI->getType())) - return false; - } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { - if (SI->isVolatile()) + if (RelBegin == 0 && RelEnd == Size) + WholeAllocaOp = true; + if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) { + if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy)) return false; - if (!canConvertValue(TD, SI->getValueOperand()->getType(), PartitionTy)) + } else if (RelBegin != 0 || RelEnd != Size || + !canConvertValue(DL, AllocaTy, LI->getType())) { + // Non-integer loads need to be convertible from the alloca type so that + // they are promotable. + return false; + } + } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { + Type *ValueTy = SI->getValueOperand()->getType(); + if (SI->isVolatile()) + return false; + if (RelBegin == 0 && RelEnd == Size) + WholeAllocaOp = true; + if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) { + if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy)) return false; - } else { + } else if (RelBegin != 0 || RelEnd != Size || + !canConvertValue(DL, ValueTy, AllocaTy)) { + // Non-integer stores need to be convertible to the alloca type so that + // they are promotable. return false; } + } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { + if (MI->isVolatile() || !isa<Constant>(MI->getLength())) + return false; + if (!I->isSplittable()) + return false; // Skip any unsplittable intrinsics. + } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) { + if (II->getIntrinsicID() != Intrinsic::lifetime_start && + II->getIntrinsicID() != Intrinsic::lifetime_end) + return false; + } else { + return false; } + return true; } @@ -2110,97 +1716,50 @@ static bool isVectorPromotionViable(const DataLayout &TD, /// This is a quick test to check whether we can rewrite the integer loads and /// stores to a particular alloca into wider loads and stores and be able to /// promote the resulting alloca. -static bool isIntegerWideningViable(const DataLayout &TD, - Type *AllocaTy, - uint64_t AllocBeginOffset, - AllocaPartitioning &P, - AllocaPartitioning::const_use_iterator I, - AllocaPartitioning::const_use_iterator E) { - uint64_t SizeInBits = TD.getTypeSizeInBits(AllocaTy); +static bool +isIntegerWideningViable(const DataLayout &DL, Type *AllocaTy, + uint64_t AllocBeginOffset, AllocaSlices &S, + AllocaSlices::const_iterator I, + AllocaSlices::const_iterator E, + ArrayRef<AllocaSlices::iterator> SplitUses) { + uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy); // Don't create integer types larger than the maximum bitwidth. if (SizeInBits > IntegerType::MAX_INT_BITS) return false; // Don't try to handle allocas with bit-padding. - if (SizeInBits != TD.getTypeStoreSizeInBits(AllocaTy)) + if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy)) return false; // We need to ensure that an integer type with the appropriate bitwidth can // be converted to the alloca type, whatever that is. We don't want to force // the alloca itself to have an integer type if there is a more suitable one. 
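The bit-padding check above (SizeInBits != getTypeStoreSizeInBits) rejects types whose declared width does not fill the bytes they are stored in, such as i1. A sketch of the same predicate under the usual round-up-to-bytes store-size rule (an assumption for illustration; the real answer comes from DataLayout):

#include <cstdint>

// A type has bit-level padding when its bit width doesn't fill its storage
// bytes; such allocas are not widened.
bool hasBitPadding(uint64_t SizeInBits) {
  uint64_t StoreSizeInBits = ((SizeInBits + 7) / 8) * 8; // round up to bytes
  return SizeInBits != StoreSizeInBits;
}

The integer conversion check described above follows.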
Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits); - if (!canConvertValue(TD, AllocaTy, IntTy) || - !canConvertValue(TD, IntTy, AllocaTy)) + if (!canConvertValue(DL, AllocaTy, IntTy) || + !canConvertValue(DL, IntTy, AllocaTy)) return false; - uint64_t Size = TD.getTypeStoreSize(AllocaTy); - - // Check the uses to ensure the uses are (likely) promotable integer uses. - // Also ensure that the alloca has a covering load or store. We don't want - // to widen the integer operations only to fail to promote due to some other - // unsplittable entry (which we may make splittable later). - bool WholeAllocaOp = false; - for (; I != E; ++I) { - Use *U = I->getUse(); - if (!U) - continue; // Skip dead use. + uint64_t Size = DL.getTypeStoreSize(AllocaTy); - uint64_t RelBegin = I->BeginOffset - AllocBeginOffset; - uint64_t RelEnd = I->EndOffset - AllocBeginOffset; + // While examining uses, we ensure that the alloca has a covering load or + // store. We don't want to widen the integer operations only to fail to + // promote due to some other unsplittable entry (which we may make splittable + // later). However, if there are only splittable uses, go ahead and assume + // that we cover the alloca. + bool WholeAllocaOp = (I != E) ? false : DL.isLegalInteger(SizeInBits); - // We can't reasonably handle cases where the load or store extends past - // the end of the aloca's type and into its padding. - if (RelEnd > Size) + for (; I != E; ++I) + if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size, + S, I, WholeAllocaOp)) return false; - if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { - if (LI->isVolatile()) - return false; - if (RelBegin == 0 && RelEnd == Size) - WholeAllocaOp = true; - if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) { - if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy)) - return false; - continue; - } - // Non-integer loads need to be convertible from the alloca type so that - // they are promotable. - if (RelBegin != 0 || RelEnd != Size || - !canConvertValue(TD, AllocaTy, LI->getType())) - return false; - } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { - Type *ValueTy = SI->getValueOperand()->getType(); - if (SI->isVolatile()) - return false; - if (RelBegin == 0 && RelEnd == Size) - WholeAllocaOp = true; - if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) { - if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy)) - return false; - continue; - } - // Non-integer stores need to be convertible to the alloca type so that - // they are promotable. 
- if (RelBegin != 0 || RelEnd != Size || - !canConvertValue(TD, ValueTy, AllocaTy)) - return false; - } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { - if (MI->isVolatile() || !isa<Constant>(MI->getLength())) - return false; - if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U->getUser())) { - const AllocaPartitioning::MemTransferOffsets &MTO - = P.getMemTransferOffsets(*MTI); - if (!MTO.IsSplittable) - return false; - } - } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) { - if (II->getIntrinsicID() != Intrinsic::lifetime_start && - II->getIntrinsicID() != Intrinsic::lifetime_end) - return false; - } else { + for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(), + SUE = SplitUses.end(); + SUI != SUE; ++SUI) + if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size, + S, *SUI, WholeAllocaOp)) return false; - } - } + return WholeAllocaOp; } @@ -2335,19 +1894,19 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, } namespace { -/// \brief Visitor to rewrite instructions using a partition of an alloca to -/// use a new alloca. +/// \brief Visitor to rewrite instructions using a particular slice of an alloca +/// to use a new alloca. /// /// Also implements the rewriting to vector-based accesses when the partition /// passes the isVectorPromotionViable predicate. Most of the rewriting logic /// lives here. -class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter, - bool> { +class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> { // Befriend the base class so it can delegate to private visit methods. - friend class llvm::InstVisitor<AllocaPartitionRewriter, bool>; + friend class llvm::InstVisitor<AllocaSliceRewriter, bool>; + typedef llvm::InstVisitor<AllocaSliceRewriter, bool> Base; - const DataLayout &TD; - AllocaPartitioning &P; + const DataLayout &DL; + AllocaSlices &S; SROA &Pass; AllocaInst &OldAI, &NewAI; const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset; @@ -2372,106 +1931,112 @@ class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter, // integer type will be stored here for easy access during rewriting. IntegerType *IntTy; - // The offset of the partition user currently being rewritten. + // The offset of the slice currently being rewritten. uint64_t BeginOffset, EndOffset; + bool IsSplittable; bool IsSplit; Use *OldUse; Instruction *OldPtr; + // Output members carrying state about the result of visiting and rewriting + // the slice of the alloca. + bool IsUsedByRewrittenSpeculatableInstructions; + // Utility IR builder, whose name prefix is set up for each visited use, and // the insertion point is set to point to the user. IRBuilderTy IRB; public: - AllocaPartitionRewriter(const DataLayout &TD, AllocaPartitioning &P, - AllocaPartitioning::iterator PI, - SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI, - uint64_t NewBeginOffset, uint64_t NewEndOffset) - : TD(TD), P(P), Pass(Pass), - OldAI(OldAI), NewAI(NewAI), - NewAllocaBeginOffset(NewBeginOffset), - NewAllocaEndOffset(NewEndOffset), - NewAllocaTy(NewAI.getAllocatedType()), - VecTy(), ElementTy(), ElementSize(), IntTy(), - BeginOffset(), EndOffset(), IsSplit(), OldUse(), OldPtr(), - IRB(NewAI.getContext(), ConstantFolder()) { - } - - /// \brief Visit the users of the alloca partition and rewrite them.
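Integer widening only succeeds when some load or store spans the whole alloca; the WholeAllocaOp flag above records that, and is pre-seeded only when the main range has no slices and the width is a legal integer. A simplified model of the covering-access accounting (hypothetical Slice struct; the legal-integer seeding is reduced to the empty case):

#include <cstdint>
#include <vector>

struct Slice { uint64_t Begin, End; bool IsLoadOrStore; };

// True if at least one load/store covers [0, Size) exactly, the condition
// the WholeAllocaOp flag above tracks across all slices.
bool hasCoveringAccess(const std::vector<Slice> &Slices, uint64_t Size) {
  bool WholeAllocaOp = Slices.empty(); // assume coverage with no real uses
  for (const Slice &S : Slices)
    if (S.IsLoadOrStore && S.Begin == 0 && S.End == Size)
      WholeAllocaOp = true;
  return WholeAllocaOp;
}

Below, the old per-partition visitUsers loop is removed in favor of per-slice visit calls.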
- bool visitUsers(AllocaPartitioning::const_use_iterator I, - AllocaPartitioning::const_use_iterator E) { - if (isVectorPromotionViable(TD, NewAI.getAllocatedType(), P, - NewAllocaBeginOffset, NewAllocaEndOffset, - I, E)) { - ++NumVectorized; - VecTy = cast<VectorType>(NewAI.getAllocatedType()); - ElementTy = VecTy->getElementType(); - assert((TD.getTypeSizeInBits(VecTy->getScalarType()) % 8) == 0 && + AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &S, SROA &Pass, + AllocaInst &OldAI, AllocaInst &NewAI, + uint64_t NewBeginOffset, uint64_t NewEndOffset, + bool IsVectorPromotable = false, + bool IsIntegerPromotable = false) + : DL(DL), S(S), Pass(Pass), OldAI(OldAI), NewAI(NewAI), + NewAllocaBeginOffset(NewBeginOffset), NewAllocaEndOffset(NewEndOffset), + NewAllocaTy(NewAI.getAllocatedType()), + VecTy(IsVectorPromotable ? cast<VectorType>(NewAllocaTy) : 0), + ElementTy(VecTy ? VecTy->getElementType() : 0), + ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0), + IntTy(IsIntegerPromotable + ? Type::getIntNTy( + NewAI.getContext(), + DL.getTypeSizeInBits(NewAI.getAllocatedType())) + : 0), + BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(), + OldPtr(), IsUsedByRewrittenSpeculatableInstructions(false), + IRB(NewAI.getContext(), ConstantFolder()) { + if (VecTy) { + assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 && "Only multiple-of-8 sized vector elements are viable"); - ElementSize = TD.getTypeSizeInBits(VecTy->getScalarType()) / 8; - } else if (isIntegerWideningViable(TD, NewAI.getAllocatedType(), - NewAllocaBeginOffset, P, I, E)) { - IntTy = Type::getIntNTy(NewAI.getContext(), - TD.getTypeSizeInBits(NewAI.getAllocatedType())); + ++NumVectorized; } + assert((!IsVectorPromotable && !IsIntegerPromotable) || + IsVectorPromotable != IsIntegerPromotable); + } + + bool visit(AllocaSlices::const_iterator I) { bool CanSROA = true; - for (; I != E; ++I) { - if (!I->getUse()) - continue; // Skip dead uses. - BeginOffset = I->BeginOffset; - EndOffset = I->EndOffset; - IsSplit = I->isSplit(); - OldUse = I->getUse(); - OldPtr = cast<Instruction>(OldUse->get()); - - Instruction *OldUserI = cast<Instruction>(OldUse->getUser()); - IRB.SetInsertPoint(OldUserI); - IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc()); - IRB.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) + - "."); - - CanSROA &= visit(cast<Instruction>(OldUse->getUser())); - } - if (VecTy) { + BeginOffset = I->beginOffset(); + EndOffset = I->endOffset(); + IsSplittable = I->isSplittable(); + IsSplit = + BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset; + + OldUse = I->getUse(); + OldPtr = cast<Instruction>(OldUse->get()); + + Instruction *OldUserI = cast<Instruction>(OldUse->getUser()); + IRB.SetInsertPoint(OldUserI); + IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc()); + IRB.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) + "."); + + CanSROA &= visit(cast<Instruction>(OldUse->getUser())); + if (VecTy || IntTy) assert(CanSROA); - VecTy = 0; - ElementTy = 0; - ElementSize = 0; - } - if (IntTy) { - assert(CanSROA); - IntTy = 0; - } return CanSROA; } + /// \brief Query whether this slice is used by speculatable instructions after + /// rewriting. + /// + /// These instructions (PHIs and Selects currently) require the alloca slice + /// to run back through the rewriter. Thus, they are promotable, but not on + /// this iteration. 
This is distinct from a slice which is unpromotable for + /// some other reason, in which case we don't even want to perform the + /// speculation. This can be queried at any time and reflects whether (at + /// that point) a visit call has rewritten a speculatable instruction on the + /// current slice. + bool isUsedByRewrittenSpeculatableInstructions() const { + return IsUsedByRewrittenSpeculatableInstructions; + } + private: + // Make sure the other visit overloads are visible. + using Base::visit; + // Every instruction which can end up as a user must have a rewrite rule. bool visitInstruction(Instruction &I) { DEBUG(dbgs() << "    !!!! Cannot rewrite: " << I << "\n"); llvm_unreachable("No rewrite rule for this instruction!"); } - Value *getAdjustedAllocaPtr(IRBuilderTy &IRB, Type *PointerTy) { - assert(BeginOffset >= NewAllocaBeginOffset); - APInt Offset(TD.getPointerSizeInBits(), BeginOffset - NewAllocaBeginOffset); - return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy); + Value *getAdjustedAllocaPtr(IRBuilderTy &IRB, uint64_t Offset, + Type *PointerTy) { + assert(Offset >= NewAllocaBeginOffset); + return getAdjustedPtr(IRB, DL, &NewAI, APInt(DL.getPointerSizeInBits(), + Offset - NewAllocaBeginOffset), + PointerTy); } /// \brief Compute suitable alignment to access an offset into the new alloca. unsigned getOffsetAlign(uint64_t Offset) { unsigned NewAIAlign = NewAI.getAlignment(); if (!NewAIAlign) - NewAIAlign = TD.getABITypeAlignment(NewAI.getAllocatedType()); + NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType()); return MinAlign(NewAIAlign, Offset); } - /// \brief Compute suitable alignment to access this partition of the new - /// alloca. - unsigned getPartitionAlign() { - return getOffsetAlign(BeginOffset - NewAllocaBeginOffset); - } - /// \brief Compute suitable alignment to access a type at an offset of the /// new alloca. /// @@ -2479,15 +2044,7 @@ private: /// otherwise returns the maximal suitable alignment. unsigned getOffsetTypeAlign(Type *Ty, uint64_t Offset) { unsigned Align = getOffsetAlign(Offset); - return Align == TD.getABITypeAlignment(Ty) ? 0 : Align; - } - - /// \brief Compute suitable alignment to access a type at the beginning of - /// this partition of the new alloca. - /// - /// See \c getOffsetTypeAlign for details; this routine delegates to it. - unsigned getPartitionTypeAlign(Type *Ty) { - return getOffsetTypeAlign(Ty, BeginOffset - NewAllocaBeginOffset); + return Align == DL.getABITypeAlignment(Ty) ?
0 : Align; } unsigned getIndex(uint64_t Offset) { @@ -2505,9 +2062,10 @@ private: Pass.DeadInsts.insert(I); } - Value *rewriteVectorizedLoadInst() { - unsigned BeginIndex = getIndex(BeginOffset); - unsigned EndIndex = getIndex(EndOffset); + Value *rewriteVectorizedLoadInst(uint64_t NewBeginOffset, + uint64_t NewEndOffset) { + unsigned BeginIndex = getIndex(NewBeginOffset); + unsigned EndIndex = getIndex(NewEndOffset); assert(EndIndex > BeginIndex && "Empty vector!"); Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), @@ -2515,16 +2073,17 @@ private: return extractVector(IRB, V, BeginIndex, EndIndex, "vec"); } - Value *rewriteIntegerLoad(LoadInst &LI) { + Value *rewriteIntegerLoad(LoadInst &LI, uint64_t NewBeginOffset, + uint64_t NewEndOffset) { assert(IntTy && "We cannot insert an integer to the alloca"); assert(!LI.isVolatile()); Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); - V = convertValue(TD, IRB, V, IntTy); - assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); - uint64_t Offset = BeginOffset - NewAllocaBeginOffset; - if (Offset > 0 || EndOffset < NewAllocaEndOffset) - V = extractInteger(TD, IRB, V, cast<IntegerType>(LI.getType()), Offset, + V = convertValue(DL, IRB, V, IntTy); + assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; + if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) + V = extractInteger(DL, IRB, V, cast<IntegerType>(LI.getType()), Offset, "extract"); return V; } @@ -2534,37 +2093,44 @@ private: Value *OldOp = LI.getOperand(0); assert(OldOp == OldPtr); - uint64_t Size = EndOffset - BeginOffset; + // Compute the intersecting offset range. + assert(BeginOffset < NewAllocaEndOffset); + assert(EndOffset > NewAllocaBeginOffset); + uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); + uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); + + uint64_t Size = NewEndOffset - NewBeginOffset; Type *TargetTy = IsSplit ? 
Type::getIntNTy(LI.getContext(), Size * 8) : LI.getType(); bool IsPtrAdjusted = false; Value *V; if (VecTy) { - V = rewriteVectorizedLoadInst(); + V = rewriteVectorizedLoadInst(NewBeginOffset, NewEndOffset); } else if (IntTy && LI.getType()->isIntegerTy()) { - V = rewriteIntegerLoad(LI); - } else if (BeginOffset == NewAllocaBeginOffset && - canConvertValue(TD, NewAllocaTy, LI.getType())) { + V = rewriteIntegerLoad(LI, NewBeginOffset, NewEndOffset); + } else if (NewBeginOffset == NewAllocaBeginOffset && + canConvertValue(DL, NewAllocaTy, LI.getType())) { V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), LI.isVolatile(), "load"); } else { Type *LTy = TargetTy->getPointerTo(); - V = IRB.CreateAlignedLoad(getAdjustedAllocaPtr(IRB, LTy), - getPartitionTypeAlign(TargetTy), - LI.isVolatile(), "load"); + V = IRB.CreateAlignedLoad( + getAdjustedAllocaPtr(IRB, NewBeginOffset, LTy), + getOffsetTypeAlign(TargetTy, NewBeginOffset - NewAllocaBeginOffset), + LI.isVolatile(), "load"); IsPtrAdjusted = true; } - V = convertValue(TD, IRB, V, TargetTy); + V = convertValue(DL, IRB, V, TargetTy); if (IsSplit) { assert(!LI.isVolatile()); assert(LI.getType()->isIntegerTy() && "Only integer type loads and stores are split"); - assert(Size < TD.getTypeStoreSize(LI.getType()) && + assert(Size < DL.getTypeStoreSize(LI.getType()) && "Split load isn't smaller than original load"); assert(LI.getType()->getIntegerBitWidth() == - TD.getTypeStoreSizeInBits(LI.getType()) && + DL.getTypeStoreSizeInBits(LI.getType()) && "Non-byte-multiple bit width"); // Move the insertion point just past the load so that we can refer to it. IRB.SetInsertPoint(llvm::next(BasicBlock::iterator(&LI))); @@ -2574,7 +2140,7 @@ private: // LI only used for this computation. Value *Placeholder = new LoadInst(UndefValue::get(LI.getType()->getPointerTo())); - V = insertInteger(TD, IRB, Placeholder, V, BeginOffset, + V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset, "insert"); LI.replaceAllUsesWith(V); Placeholder->replaceAllUsesWith(&LI); @@ -2589,24 +2155,26 @@ private: return !LI.isVolatile() && !IsPtrAdjusted; } - bool rewriteVectorizedStoreInst(Value *V, - StoreInst &SI, Value *OldOp) { - unsigned BeginIndex = getIndex(BeginOffset); - unsigned EndIndex = getIndex(EndOffset); - assert(EndIndex > BeginIndex && "Empty vector!"); - unsigned NumElements = EndIndex - BeginIndex; - assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); - Type *PartitionTy - = (NumElements == 1) ? ElementTy - : VectorType::get(ElementTy, NumElements); - if (V->getType() != PartitionTy) - V = convertValue(TD, IRB, V, PartitionTy); - - // Mix in the existing elements. - Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "load"); - V = insertVector(IRB, Old, V, BeginIndex, "vec"); + bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp, + uint64_t NewBeginOffset, + uint64_t NewEndOffset) { + if (V->getType() != VecTy) { + unsigned BeginIndex = getIndex(NewBeginOffset); + unsigned EndIndex = getIndex(NewEndOffset); + assert(EndIndex > BeginIndex && "Empty vector!"); + unsigned NumElements = EndIndex - BeginIndex; + assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); + Type *SliceTy = + (NumElements == 1) ? ElementTy + : VectorType::get(ElementTy, NumElements); + if (V->getType() != SliceTy) + V = convertValue(DL, IRB, V, SliceTy); + // Mix in the existing elements. 
+ Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + "load"); + V = insertVector(IRB, Old, V, BeginIndex, "vec"); + } StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); Pass.DeadInsts.insert(&SI); @@ -2615,19 +2183,20 @@ private: return true; } - bool rewriteIntegerStore(Value *V, StoreInst &SI) { + bool rewriteIntegerStore(Value *V, StoreInst &SI, + uint64_t NewBeginOffset, uint64_t NewEndOffset) { assert(IntTy && "We cannot extract an integer from the alloca"); assert(!SI.isVolatile()); - if (TD.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { + if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); - Old = convertValue(TD, IRB, Old, IntTy); + Old = convertValue(DL, IRB, Old, IntTy); assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t Offset = BeginOffset - NewAllocaBeginOffset; - V = insertInteger(TD, IRB, Old, SI.getValueOperand(), Offset, + V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert"); } - V = convertValue(TD, IRB, V, NewAllocaTy); + V = convertValue(DL, IRB, V, NewAllocaTy); StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); Pass.DeadInsts.insert(&SI); (void)Store; @@ -2648,37 +2217,45 @@ private: if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets())) Pass.PostPromotionWorklist.insert(AI); - uint64_t Size = EndOffset - BeginOffset; - if (Size < TD.getTypeStoreSize(V->getType())) { + // Compute the intersecting offset range. + assert(BeginOffset < NewAllocaEndOffset); + assert(EndOffset > NewAllocaBeginOffset); + uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); + uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); + + uint64_t Size = NewEndOffset - NewBeginOffset; + if (Size < DL.getTypeStoreSize(V->getType())) { assert(!SI.isVolatile()); - assert(IsSplit && "A seemingly split store isn't splittable"); assert(V->getType()->isIntegerTy() && "Only integer type loads and stores are split"); assert(V->getType()->getIntegerBitWidth() == - TD.getTypeStoreSizeInBits(V->getType()) && + DL.getTypeStoreSizeInBits(V->getType()) && "Non-byte-multiple bit width"); IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), Size * 8); - V = extractInteger(TD, IRB, V, NarrowTy, BeginOffset, + V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset, "extract"); } if (VecTy) - return rewriteVectorizedStoreInst(V, SI, OldOp); + return rewriteVectorizedStoreInst(V, SI, OldOp, NewBeginOffset, + NewEndOffset); if (IntTy && V->getType()->isIntegerTy()) - return rewriteIntegerStore(V, SI); + return rewriteIntegerStore(V, SI, NewBeginOffset, NewEndOffset); StoreInst *NewSI; - if (BeginOffset == NewAllocaBeginOffset && - EndOffset == NewAllocaEndOffset && - canConvertValue(TD, V->getType(), NewAllocaTy)) { - V = convertValue(TD, IRB, V, NewAllocaTy); + if (NewBeginOffset == NewAllocaBeginOffset && + NewEndOffset == NewAllocaEndOffset && + canConvertValue(DL, V->getType(), NewAllocaTy)) { + V = convertValue(DL, IRB, V, NewAllocaTy); NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), SI.isVolatile()); } else { - Value *NewPtr = getAdjustedAllocaPtr(IRB, V->getType()->getPointerTo()); - NewSI = IRB.CreateAlignedStore(V, NewPtr, - getPartitionTypeAlign(V->getType()), - SI.isVolatile()); + Value *NewPtr = getAdjustedAllocaPtr(IRB, NewBeginOffset, + V->getType()->getPointerTo()); + NewSI = IRB.CreateAlignedStore( + V, NewPtr, 
getOffsetTypeAlign( + V->getType(), NewBeginOffset - NewAllocaBeginOffset), + SI.isVolatile()); } (void)NewSI; Pass.DeadInsts.insert(&SI); @@ -2729,9 +2306,12 @@ private: // If the memset has a variable size, it cannot be split, just adjust the // pointer to the new alloca. if (!isa<Constant>(II.getLength())) { - II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType())); + assert(!IsSplit); + assert(BeginOffset >= NewAllocaBeginOffset); + II.setDest( + getAdjustedAllocaPtr(IRB, BeginOffset, II.getRawDest()->getType())); Type *CstTy = II.getAlignmentCst()->getType(); - II.setAlignment(ConstantInt::get(CstTy, getPartitionAlign())); + II.setAlignment(ConstantInt::get(CstTy, getOffsetAlign(BeginOffset))); deleteIfTriviallyDead(OldPtr); return false; @@ -2743,21 +2323,26 @@ private: Type *AllocaTy = NewAI.getAllocatedType(); Type *ScalarTy = AllocaTy->getScalarType(); + // Compute the intersecting offset range. + assert(BeginOffset < NewAllocaEndOffset); + assert(EndOffset > NewAllocaBeginOffset); + uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); + uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); + uint64_t SliceOffset = NewBeginOffset - NewAllocaBeginOffset; + // If this doesn't map cleanly onto the alloca type, and that type isn't // a single value type, just emit a memset. if (!VecTy && !IntTy && - (BeginOffset != NewAllocaBeginOffset || - EndOffset != NewAllocaEndOffset || + (BeginOffset > NewAllocaBeginOffset || + EndOffset < NewAllocaEndOffset || !AllocaTy->isSingleValueType() || - !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)) || - TD.getTypeSizeInBits(ScalarTy)%8 != 0)) { + !DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy)) || + DL.getTypeSizeInBits(ScalarTy)%8 != 0)) { Type *SizeTy = II.getLength()->getType(); - Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); - CallInst *New - = IRB.CreateMemSet(getAdjustedAllocaPtr(IRB, - II.getRawDest()->getType()), - II.getValue(), Size, getPartitionAlign(), - II.isVolatile()); + Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); + CallInst *New = IRB.CreateMemSet( + getAdjustedAllocaPtr(IRB, NewBeginOffset, II.getRawDest()->getType()), + II.getValue(), Size, getOffsetAlign(SliceOffset), II.isVolatile()); (void)New; DEBUG(dbgs() << " to: " << *New << "\n"); return false; @@ -2774,15 +2359,15 @@ private: // If this is a memset of a vectorized alloca, insert it. assert(ElementTy == ScalarTy); - unsigned BeginIndex = getIndex(BeginOffset); - unsigned EndIndex = getIndex(EndOffset); + unsigned BeginIndex = getIndex(NewBeginOffset); + unsigned EndIndex = getIndex(NewEndOffset); assert(EndIndex > BeginIndex && "Empty vector!"); unsigned NumElements = EndIndex - BeginIndex; assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); Value *Splat = - getIntegerSplat(II.getValue(), TD.getTypeSizeInBits(ElementTy) / 8); - Splat = convertValue(TD, IRB, Splat, ElementTy); + getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ElementTy) / 8); + Splat = convertValue(DL, IRB, Splat, ElementTy); if (NumElements > 1) Splat = getVectorSplat(Splat, NumElements); @@ -2794,32 +2379,31 @@ private: // set integer. 
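When a memset lands on an integer-widened alloca, the byte value is splatted to the access width and, for a partial overwrite, merged into the old value with insertInteger. A little-endian sketch of both steps (assumed widths up to 64 bits; the real helpers use APInt and also handle big-endian layouts):

#include <cstdint>

// Replicate a memset byte across Size bytes, as getIntegerSplat does.
uint64_t integerSplat(uint8_t Byte, unsigned Size) {
  uint64_t V = 0;
  for (unsigned i = 0; i != Size; ++i)
    V = (V << 8) | Byte; // e.g. integerSplat(0xAB, 4) == 0xABABABAB
  return V;
}

// Merge a narrow value into the old alloca-wide integer at a byte offset,
// the analogue of the insertInteger call in the integer path below.
uint64_t insertBytes(uint64_t Old, uint64_t Narrow, unsigned ByteOffset,
                     unsigned NumBytes) {
  uint64_t Mask = NumBytes >= 8 ? ~0ULL : ((1ULL << (NumBytes * 8)) - 1);
  unsigned Shift = ByteOffset * 8;
  return (Old & ~(Mask << Shift)) | ((Narrow & Mask) << Shift);
}

The integer path of the memset rewrite continues below.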
assert(!II.isVolatile()); - uint64_t Size = EndOffset - BeginOffset; + uint64_t Size = NewEndOffset - NewBeginOffset; V = getIntegerSplat(II.getValue(), Size); if (IntTy && (BeginOffset != NewAllocaBeginOffset || EndOffset != NewAllocaBeginOffset)) { Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); - Old = convertValue(TD, IRB, Old, IntTy); - assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); - uint64_t Offset = BeginOffset - NewAllocaBeginOffset; - V = insertInteger(TD, IRB, Old, V, Offset, "insert"); + Old = convertValue(DL, IRB, Old, IntTy); + uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; + V = insertInteger(DL, IRB, Old, V, Offset, "insert"); } else { assert(V->getType() == IntTy && "Wrong type for an alloca wide integer!"); } - V = convertValue(TD, IRB, V, AllocaTy); + V = convertValue(DL, IRB, V, AllocaTy); } else { // Established these invariants above. - assert(BeginOffset == NewAllocaBeginOffset); - assert(EndOffset == NewAllocaEndOffset); + assert(NewBeginOffset == NewAllocaBeginOffset); + assert(NewEndOffset == NewAllocaEndOffset); - V = getIntegerSplat(II.getValue(), TD.getTypeSizeInBits(ScalarTy) / 8); + V = getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ScalarTy) / 8); if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy)) V = getVectorSplat(V, AllocaVecTy->getNumElements()); - V = convertValue(TD, IRB, V, AllocaTy); + V = convertValue(DL, IRB, V, AllocaTy); } Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), @@ -2835,21 +2419,25 @@ private: DEBUG(dbgs() << " original: " << II << "\n"); + // Compute the intersecting offset range. + assert(BeginOffset < NewAllocaEndOffset); + assert(EndOffset > NewAllocaBeginOffset); + uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); + uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); + assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr); bool IsDest = II.getRawDest() == OldPtr; - const AllocaPartitioning::MemTransferOffsets &MTO - = P.getMemTransferOffsets(II); - // Compute the relative offset within the transfer. - unsigned IntPtrWidth = TD.getPointerSizeInBits(); - APInt RelOffset(IntPtrWidth, BeginOffset - (IsDest ? MTO.DestBegin - : MTO.SourceBegin)); + unsigned IntPtrWidth = DL.getPointerSizeInBits(); + APInt RelOffset(IntPtrWidth, NewBeginOffset - BeginOffset); unsigned Align = II.getAlignment(); + uint64_t SliceOffset = NewBeginOffset - NewAllocaBeginOffset; if (Align > 1) - Align = MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(), - MinAlign(II.getAlignment(), getPartitionAlign())); + Align = + MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(), + MinAlign(II.getAlignment(), getOffsetAlign(SliceOffset))); // For unsplit intrinsics, we simply modify the source and destination // pointers in place. This isn't just an optimization, it is a matter of @@ -2858,12 +2446,14 @@ private: // a variable length. We may also be dealing with memmove instead of // memcpy, and so simply updating the pointers is the necessary for us to // update both source and dest of a single call. - if (!MTO.IsSplittable) { + if (!IsSplittable) { Value *OldOp = IsDest ? 
II.getRawDest() : II.getRawSource(); if (IsDest) - II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType())); + II.setDest( + getAdjustedAllocaPtr(IRB, BeginOffset, II.getRawDest()->getType())); else - II.setSource(getAdjustedAllocaPtr(IRB, II.getRawSource()->getType())); + II.setSource(getAdjustedAllocaPtr(IRB, BeginOffset, + II.getRawSource()->getType())); Type *CstTy = II.getAlignmentCst()->getType(); II.setAlignment(ConstantInt::get(CstTy, Align)); @@ -2881,24 +2471,21 @@ private: // If this doesn't map cleanly onto the alloca type, and that type isn't // a single value type, just emit a memcpy. bool EmitMemCpy - = !VecTy && !IntTy && (BeginOffset != NewAllocaBeginOffset || - EndOffset != NewAllocaEndOffset || + = !VecTy && !IntTy && (BeginOffset > NewAllocaBeginOffset || + EndOffset < NewAllocaEndOffset || !NewAI.getAllocatedType()->isSingleValueType()); // If we're just going to emit a memcpy, the alloca hasn't changed, and the // size hasn't been shrunk based on analysis of the viable range, this is // a no-op. if (EmitMemCpy && &OldAI == &NewAI) { - uint64_t OrigBegin = IsDest ? MTO.DestBegin : MTO.SourceBegin; - uint64_t OrigEnd = IsDest ? MTO.DestEnd : MTO.SourceEnd; // Ensure the start lines up. - assert(BeginOffset == OrigBegin); - (void)OrigBegin; + assert(NewBeginOffset == BeginOffset); // Rewrite the size as needed. - if (EndOffset != OrigEnd) + if (NewEndOffset != EndOffset) II.setLength(ConstantInt::get(II.getLength()->getType(), - EndOffset - BeginOffset)); + NewEndOffset - NewBeginOffset)); return false; } // Record this instruction for deletion. @@ -2917,13 +2504,13 @@ private: // Compute the other pointer, folding as much as possible to produce // a single, simple GEP in most cases. - OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy); + OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, RelOffset, OtherPtrTy); - Value *OurPtr - = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType() - : II.getRawSource()->getType()); + Value *OurPtr = getAdjustedAllocaPtr( + IRB, NewBeginOffset, + IsDest ? II.getRawDest()->getType() : II.getRawSource()->getType()); Type *SizeTy = II.getLength()->getType(); - Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); + Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr, IsDest ? OtherPtr : OurPtr, @@ -2939,11 +2526,11 @@ private: if (!Align) Align = 1; - bool IsWholeAlloca = BeginOffset == NewAllocaBeginOffset && - EndOffset == NewAllocaEndOffset; - uint64_t Size = EndOffset - BeginOffset; - unsigned BeginIndex = VecTy ? getIndex(BeginOffset) : 0; - unsigned EndIndex = VecTy ? getIndex(EndOffset) : 0; + bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset && + NewEndOffset == NewAllocaEndOffset; + uint64_t Size = NewEndOffset - NewBeginOffset; + unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0; + unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0; unsigned NumElements = EndIndex - BeginIndex; IntegerType *SubIntTy = IntTy ? 
Type::getIntNTy(IntTy->getContext(), Size*8) : 0; @@ -2960,7 +2547,7 @@ private: OtherPtrTy = SubIntTy->getPointerTo(); } - Value *SrcPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy); + Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, RelOffset, OtherPtrTy); Value *DstPtr = &NewAI; if (!IsDest) std::swap(SrcPtr, DstPtr); @@ -2973,10 +2560,9 @@ private: } else if (IntTy && !IsWholeAlloca && !IsDest) { Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); - Src = convertValue(TD, IRB, Src, IntTy); - assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); - uint64_t Offset = BeginOffset - NewAllocaBeginOffset; - Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, "extract"); + Src = convertValue(DL, IRB, Src, IntTy); + uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; + Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract"); } else { Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(), "copyload"); @@ -2989,11 +2575,10 @@ private: } else if (IntTy && !IsWholeAlloca && IsDest) { Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); - Old = convertValue(TD, IRB, Old, IntTy); - assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); - uint64_t Offset = BeginOffset - NewAllocaBeginOffset; - Src = insertInteger(TD, IRB, Old, Src, Offset, "insert"); - Src = convertValue(TD, IRB, Src, NewAllocaTy); + Old = convertValue(DL, IRB, Old, IntTy); + uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; + Src = insertInteger(DL, IRB, Old, Src, Offset, "insert"); + Src = convertValue(DL, IRB, Src, NewAllocaTy); } StoreInst *Store = cast<StoreInst>( @@ -3009,13 +2594,20 @@ private: DEBUG(dbgs() << " original: " << II << "\n"); assert(II.getArgOperand(1) == OldPtr); + // Compute the intersecting offset range. + assert(BeginOffset < NewAllocaEndOffset); + assert(EndOffset > NewAllocaBeginOffset); + uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); + uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); + // Record this instruction for deletion. Pass.DeadInsts.insert(&II); ConstantInt *Size = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), - EndOffset - BeginOffset); - Value *Ptr = getAdjustedAllocaPtr(IRB, II.getArgOperand(1)->getType()); + NewEndOffset - NewBeginOffset); + Value *Ptr = + getAdjustedAllocaPtr(IRB, NewBeginOffset, II.getArgOperand(1)->getType()); Value *New; if (II.getIntrinsicID() == Intrinsic::lifetime_start) New = IRB.CreateLifetimeStart(Ptr, Size); @@ -3029,30 +2621,45 @@ private: bool visitPHINode(PHINode &PN) { DEBUG(dbgs() << " original: " << PN << "\n"); + assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable"); + assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable"); // We would like to compute a new pointer in only one place, but have it be // as local as possible to the PHI. To do that, we re-use the location of // the old pointer, which necessarily must be in the right position to // dominate the PHI. - IRBuilderTy PtrBuilder(cast<Instruction>(OldPtr)); + IRBuilderTy PtrBuilder(OldPtr); PtrBuilder.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) + "."); - Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType()); + Value *NewPtr = + getAdjustedAllocaPtr(PtrBuilder, BeginOffset, OldPtr->getType()); // Replace the operands which were using the old pointer. 
std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr); DEBUG(dbgs() << " to: " << PN << "\n"); deleteIfTriviallyDead(OldPtr); - return false; + + // Check whether we can speculate this PHI node, and if so remember that + // fact and queue it up for another iteration after the speculation + // occurs. + if (isSafePHIToSpeculate(PN, &DL)) { + Pass.SpeculatablePHIs.insert(&PN); + IsUsedByRewrittenSpeculatableInstructions = true; + return true; + } + + return false; // PHIs can't be promoted on their own. } bool visitSelectInst(SelectInst &SI) { DEBUG(dbgs() << " original: " << SI << "\n"); assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) && "Pointer isn't an operand!"); + assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable"); + assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable"); - Value *NewPtr = getAdjustedAllocaPtr(IRB, OldPtr->getType()); + Value *NewPtr = getAdjustedAllocaPtr(IRB, BeginOffset, OldPtr->getType()); // Replace the operands which were using the old pointer. if (SI.getOperand(1) == OldPtr) SI.setOperand(1, NewPtr); @@ -3061,7 +2668,17 @@ private: DEBUG(dbgs() << " to: " << SI << "\n"); deleteIfTriviallyDead(OldPtr); - return false; + + // Check whether we can speculate this select instruction, and if so + // remember that fact and queue it up for another iteration after the + // speculation occurs. + if (isSafeSelectToSpeculate(SI, &DL)) { + Pass.SpeculatableSelects.insert(&SI); + IsUsedByRewrittenSpeculatableInstructions = true; + return true; + } + + return false; // Selects can't be promoted on their own. } }; @@ -3077,7 +2694,7 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { // Befriend the base class so it can delegate to private visit methods. friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>; - const DataLayout &TD; + const DataLayout &DL; /// Queue of pointer uses to analyze and potentially rewrite. SmallVector<Use *, 8> Queue; @@ -3090,7 +2707,7 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { Use *U; public: - AggLoadStoreRewriter(const DataLayout &TD) : TD(TD) {} + AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {} /// Rewrite loads and stores through a pointer and all pointers derived from /// it. @@ -3319,12 +2936,12 @@ static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) { /// when the size or offset cause either end of type-based partition to be off. /// Also, this is a best-effort routine. It is reasonable to give up and not /// return a type if necessary. 
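Returning to the PHI and select handling above: both visitors rewrite the pointer operand immediately but queue the actual speculation (SpeculatablePHIs/SpeculatableSelects) for a later iteration. A C-level analogue of what the queued PHI speculation will eventually do (hypothetical example; each hoisted load must be provably safe in its predecessor):

// Before: a load of a PHI of pointers, written as a C analog.
int loadOfPhi(bool FromA, const int *A, const int *B) {
  const int *P = FromA ? A : B; // stands in for a two-predecessor PHI
  return *P;                    // one load of the merged pointer
}

// After speculation: the load is hoisted into each predecessor, and the PHI
// merges the loaded values instead of the pointers.
int phiOfLoads(bool FromA, const int *A, const int *B) {
  if (FromA)
    return *A; // load in predecessor A
  return *B;   // load in predecessor B
}

getTypePartition, documented above, follows.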
-static Type *getTypePartition(const DataLayout &TD, Type *Ty, +static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset, uint64_t Size) { - if (Offset == 0 && TD.getTypeAllocSize(Ty) == Size) - return stripAggregateTypeWrapping(TD, Ty); - if (Offset > TD.getTypeAllocSize(Ty) || - (TD.getTypeAllocSize(Ty) - Offset) < Size) + if (Offset == 0 && DL.getTypeAllocSize(Ty) == Size) + return stripAggregateTypeWrapping(DL, Ty); + if (Offset > DL.getTypeAllocSize(Ty) || + (DL.getTypeAllocSize(Ty) - Offset) < Size) return 0; if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) { @@ -3333,7 +2950,7 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, return 0; Type *ElementTy = SeqTy->getElementType(); - uint64_t ElementSize = TD.getTypeAllocSize(ElementTy); + uint64_t ElementSize = DL.getTypeAllocSize(ElementTy); uint64_t NumSkippedElements = Offset / ElementSize; if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) { if (NumSkippedElements >= ArrTy->getNumElements()) @@ -3350,12 +2967,12 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, if ((Offset + Size) > ElementSize) return 0; // Recurse through the element type trying to peel off offset bytes. - return getTypePartition(TD, ElementTy, Offset, Size); + return getTypePartition(DL, ElementTy, Offset, Size); } assert(Offset == 0); if (Size == ElementSize) - return stripAggregateTypeWrapping(TD, ElementTy); + return stripAggregateTypeWrapping(DL, ElementTy); assert(Size > ElementSize); uint64_t NumElements = Size / ElementSize; if (NumElements * ElementSize != Size) @@ -3367,7 +2984,7 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, if (!STy) return 0; - const StructLayout *SL = TD.getStructLayout(STy); + const StructLayout *SL = DL.getStructLayout(STy); if (Offset >= SL->getSizeInBytes()) return 0; uint64_t EndOffset = Offset + Size; @@ -3378,7 +2995,7 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, Offset -= SL->getElementOffset(Index); Type *ElementTy = STy->getElementType(Index); - uint64_t ElementSize = TD.getTypeAllocSize(ElementTy); + uint64_t ElementSize = DL.getTypeAllocSize(ElementTy); if (Offset >= ElementSize) return 0; // The offset points into alignment padding. @@ -3386,12 +3003,12 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, if (Offset > 0 || Size < ElementSize) { if ((Offset + Size) > ElementSize) return 0; - return getTypePartition(TD, ElementTy, Offset, Size); + return getTypePartition(DL, ElementTy, Offset, Size); } assert(Offset == 0); if (Size == ElementSize) - return stripAggregateTypeWrapping(TD, ElementTy); + return stripAggregateTypeWrapping(DL, ElementTy); StructType::element_iterator EI = STy->element_begin() + Index, EE = STy->element_end(); @@ -3414,7 +3031,7 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, // Try to build up a sub-structure. StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked()); - const StructLayout *SubSL = TD.getStructLayout(SubTy); + const StructLayout *SubSL = DL.getStructLayout(SubTy); if (Size != SubSL->getSizeInBytes()) return 0; // The sub-struct doesn't have quite the size needed. @@ -3431,113 +3048,280 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, /// appropriate new offsets. It also evaluates how successful the rewrite was /// at enabling promotion and if it was successful queues the alloca to be /// promoted. 
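getTypePartition's struct case first locates the element containing Offset via StructLayout::getElementContainingOffset, then recurses with the offset rebased to that element. A stand-in for the lookup using an explicit table of ascending element offsets (hypothetical; the real code queries the StructLayout):

#include <cstdint>
#include <vector>

// Index of the struct element containing ByteOffset, given each element's
// starting offset in ascending order.
unsigned elementContainingOffset(const std::vector<uint64_t> &ElementOffsets,
                                 uint64_t ByteOffset) {
  unsigned Index = 0;
  for (unsigned e = ElementOffsets.size(); Index + 1 != e; ++Index)
    if (ElementOffsets[Index + 1] > ByteOffset)
      break;
  return Index;
}

// E.g. offsets {0, 4, 16} with ByteOffset 9 give element 1; the caller then
// recurses into that element's type with Offset - 4.

rewritePartition, whose contract is documented above, follows.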
-bool SROA::rewriteAllocaPartition(AllocaInst &AI, - AllocaPartitioning &P, - AllocaPartitioning::iterator PI) { - uint64_t AllocaSize = PI->EndOffset - PI->BeginOffset; - bool IsLive = false; - for (AllocaPartitioning::use_iterator UI = P.use_begin(PI), - UE = P.use_end(PI); - UI != UE && !IsLive; ++UI) - if (UI->getUse()) - IsLive = true; - if (!IsLive) - return false; // No live uses left of this partition. - - DEBUG(dbgs() << "Speculating PHIs and selects in partition " - << "[" << PI->BeginOffset << "," << PI->EndOffset << ")\n"); - - PHIOrSelectSpeculator Speculator(*TD, P, *this); - DEBUG(dbgs() << " speculating "); - DEBUG(P.print(dbgs(), PI, "")); - Speculator.visitUsers(PI); +bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S, + AllocaSlices::iterator B, AllocaSlices::iterator E, + int64_t BeginOffset, int64_t EndOffset, + ArrayRef<AllocaSlices::iterator> SplitUses) { + assert(BeginOffset < EndOffset); + uint64_t SliceSize = EndOffset - BeginOffset; // Try to compute a friendly type for this partition of the alloca. This // won't always succeed, in which case we fall back to a legal integer type // or an i8 array of an appropriate size. - Type *AllocaTy = 0; - if (Type *PartitionTy = P.getCommonType(PI)) - if (TD->getTypeAllocSize(PartitionTy) >= AllocaSize) - AllocaTy = PartitionTy; - if (!AllocaTy) - if (Type *PartitionTy = getTypePartition(*TD, AI.getAllocatedType(), - PI->BeginOffset, AllocaSize)) - AllocaTy = PartitionTy; - if ((!AllocaTy || - (AllocaTy->isArrayTy() && - AllocaTy->getArrayElementType()->isIntegerTy())) && - TD->isLegalInteger(AllocaSize * 8)) - AllocaTy = Type::getIntNTy(*C, AllocaSize * 8); - if (!AllocaTy) - AllocaTy = ArrayType::get(Type::getInt8Ty(*C), AllocaSize); - assert(TD->getTypeAllocSize(AllocaTy) >= AllocaSize); + Type *SliceTy = 0; + if (Type *CommonUseTy = findCommonType(B, E, EndOffset)) + if (DL->getTypeAllocSize(CommonUseTy) >= SliceSize) + SliceTy = CommonUseTy; + if (!SliceTy) + if (Type *TypePartitionTy = getTypePartition(*DL, AI.getAllocatedType(), + BeginOffset, SliceSize)) + SliceTy = TypePartitionTy; + if ((!SliceTy || (SliceTy->isArrayTy() && + SliceTy->getArrayElementType()->isIntegerTy())) && + DL->isLegalInteger(SliceSize * 8)) + SliceTy = Type::getIntNTy(*C, SliceSize * 8); + if (!SliceTy) + SliceTy = ArrayType::get(Type::getInt8Ty(*C), SliceSize); + assert(DL->getTypeAllocSize(SliceTy) >= SliceSize); + + bool IsVectorPromotable = isVectorPromotionViable( + *DL, SliceTy, S, BeginOffset, EndOffset, B, E, SplitUses); + + bool IsIntegerPromotable = + !IsVectorPromotable && + isIntegerWideningViable(*DL, SliceTy, BeginOffset, S, B, E, SplitUses); // Check for the case where we're going to rewrite to a new alloca of the // exact same type as the original, and with the same access offsets. In that // case, re-use the existing alloca, but still run through the rewriter to // perform phi and select speculation. AllocaInst *NewAI; - if (AllocaTy == AI.getAllocatedType()) { - assert(PI->BeginOffset == 0 && + if (SliceTy == AI.getAllocatedType()) { + assert(BeginOffset == 0 && "Non-zero begin offset but same alloca type"); - assert(PI == P.begin() && "Begin offset is zero on later partition"); NewAI = &AI; + // FIXME: We should be able to bail at this point with "nothing changed". + // FIXME: We might want to defer PHI speculation until after here. 
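The slice type above is chosen through a fallback chain: a common type used by the slices, then a sub-type of the original alloca covering the range, then a legal integer, and finally an i8 array. A toy model of the ordering, with types as strings and a 64-bit legal-integer assumption (the findCommonType/getTypePartition results are taken as inputs):

#include <cstdint>
#include <string>

std::string pickSliceType(uint64_t SliceSize, const std::string &CommonUseTy,
                          const std::string &CoveringTy) {
  if (!CommonUseTy.empty())
    return CommonUseTy; // type shared by the slice's loads and stores
  if (!CoveringTy.empty())
    return CoveringTy; // sub-type of the original alloca covering the range
  if (SliceSize * 8 <= 64) // stand-in for DL->isLegalInteger(SliceSize * 8)
    return "i" + std::to_string(SliceSize * 8);
  return "[" + std::to_string(SliceSize) + " x i8]"; // byte-array fallback
}

The else branch below creates the new, smaller alloca for the partition.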
} else { unsigned Alignment = AI.getAlignment(); if (!Alignment) { // The minimum alignment which users can rely on when the explicit // alignment is omitted or zero is that required by the ABI for this // type. - Alignment = TD->getABITypeAlignment(AI.getAllocatedType()); + Alignment = DL->getABITypeAlignment(AI.getAllocatedType()); } - Alignment = MinAlign(Alignment, PI->BeginOffset); + Alignment = MinAlign(Alignment, BeginOffset); // If we will get at least this much alignment from the type alone, leave // the alloca's alignment unconstrained. - if (Alignment <= TD->getABITypeAlignment(AllocaTy)) + if (Alignment <= DL->getABITypeAlignment(SliceTy)) Alignment = 0; - NewAI = new AllocaInst(AllocaTy, 0, Alignment, - AI.getName() + ".sroa." + Twine(PI - P.begin()), - &AI); + NewAI = new AllocaInst(SliceTy, 0, Alignment, + AI.getName() + ".sroa." + Twine(B - S.begin()), &AI); ++NumNewAllocas; } DEBUG(dbgs() << "Rewriting alloca partition " - << "[" << PI->BeginOffset << "," << PI->EndOffset << ") to: " - << *NewAI << "\n"); + << "[" << BeginOffset << "," << EndOffset << ") to: " << *NewAI + << "\n"); - // Track the high watermark of the post-promotion worklist. We will reset it - // to this point if the alloca is not in fact scheduled for promotion. + // Track the high watermark on several worklists that are only relevant for + // promoted allocas. We will reset it to this point if the alloca is not in + // fact scheduled for promotion. unsigned PPWOldSize = PostPromotionWorklist.size(); + unsigned SPOldSize = SpeculatablePHIs.size(); + unsigned SSOldSize = SpeculatableSelects.size(); + unsigned NumUses = 0; + + AllocaSliceRewriter Rewriter(*DL, S, *this, AI, *NewAI, BeginOffset, + EndOffset, IsVectorPromotable, + IsIntegerPromotable); + bool Promotable = true; + for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(), + SUE = SplitUses.end(); + SUI != SUE; ++SUI) { + DEBUG(dbgs() << " rewriting split "); + DEBUG(S.printSlice(dbgs(), *SUI, "")); + Promotable &= Rewriter.visit(*SUI); + ++NumUses; + } + for (AllocaSlices::iterator I = B; I != E; ++I) { + DEBUG(dbgs() << " rewriting "); + DEBUG(S.printSlice(dbgs(), I, "")); + Promotable &= Rewriter.visit(I); + ++NumUses; + } + + NumAllocaPartitionUses += NumUses; + MaxUsesPerAllocaPartition = + std::max<unsigned>(NumUses, MaxUsesPerAllocaPartition); - AllocaPartitionRewriter Rewriter(*TD, P, PI, *this, AI, *NewAI, - PI->BeginOffset, PI->EndOffset); - DEBUG(dbgs() << " rewriting "); - DEBUG(P.print(dbgs(), PI, "")); - bool Promotable = Rewriter.visitUsers(P.use_begin(PI), P.use_end(PI)); - if (Promotable) { + if (Promotable && !Rewriter.isUsedByRewrittenSpeculatableInstructions()) { DEBUG(dbgs() << " and queuing for promotion\n"); PromotableAllocas.push_back(NewAI); - } else if (NewAI != &AI) { + } else if (NewAI != &AI || + (Promotable && + Rewriter.isUsedByRewrittenSpeculatableInstructions())) { // If we can't promote the alloca, iterate on it to check for new // refinements exposed by splitting the current alloca. Don't iterate on an // alloca which didn't actually change and didn't get promoted. + // + // Alternatively, if we could promote the alloca but have speculatable + // instructions then we will speculate them after finishing our processing + // of the original alloca. Mark the new one for re-visiting in the next + // iteration so the speculated operations can be rewritten. + // + // FIXME: We should actually track whether the rewriter changed anything. 
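MinAlign(Alignment, BeginOffset) above reduces the declared alignment to what still provably holds at the partition's byte offset: the largest power of two dividing both. A sketch of that computation:

#include <cstdint>

// Largest power of two dividing both A and B (LLVM's MinAlign): isolate the
// lowest set bit of A | B.
uint64_t minAlign(uint64_t A, uint64_t B) {
  uint64_t Bits = A | B;
  return Bits & (~Bits + 1);
}

// E.g. a 16-byte-aligned alloca sliced at offset 12 only guarantees 4-byte
// alignment: minAlign(16, 12) == 4.

The re-queueing decision the comments above describe follows.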
Worklist.insert(NewAI);
  }

  // Drop any post-promotion work items if promotion didn't happen.
-  if (!Promotable)
+  if (!Promotable) {
     while (PostPromotionWorklist.size() > PPWOldSize)
       PostPromotionWorklist.pop_back();
+    while (SpeculatablePHIs.size() > SPOldSize)
+      SpeculatablePHIs.pop_back();
+    while (SpeculatableSelects.size() > SSOldSize)
+      SpeculatableSelects.pop_back();
+  }

  return true;
}

-/// \brief Walks the partitioning of an alloca rewriting uses of each partition.
-bool SROA::splitAlloca(AllocaInst &AI, AllocaPartitioning &P) {
+namespace {
+struct IsSliceEndLessOrEqualTo {
+  uint64_t UpperBound;
+
+  IsSliceEndLessOrEqualTo(uint64_t UpperBound) : UpperBound(UpperBound) {}
+
+  bool operator()(const AllocaSlices::iterator &I) {
+    return I->endOffset() <= UpperBound;
+  }
+};
+}
+
+static void
+removeFinishedSplitUses(SmallVectorImpl<AllocaSlices::iterator> &SplitUses,
+                        uint64_t &MaxSplitUseEndOffset, uint64_t Offset) {
+  if (Offset >= MaxSplitUseEndOffset) {
+    SplitUses.clear();
+    MaxSplitUseEndOffset = 0;
+    return;
+  }
+
+  size_t SplitUsesOldSize = SplitUses.size();
+  SplitUses.erase(std::remove_if(SplitUses.begin(), SplitUses.end(),
+                                 IsSliceEndLessOrEqualTo(Offset)),
+                  SplitUses.end());
+  if (SplitUsesOldSize == SplitUses.size())
+    return;
+
+  // Recompute the max. While this is linear, so is remove_if.
+  MaxSplitUseEndOffset = 0;
+  for (SmallVectorImpl<AllocaSlices::iterator>::iterator
+           SUI = SplitUses.begin(),
+           SUE = SplitUses.end();
+       SUI != SUE; ++SUI)
+    MaxSplitUseEndOffset = std::max((*SUI)->endOffset(), MaxSplitUseEndOffset);
+}
+
+/// \brief Walks the slices of an alloca and forms partitions based on them,
+/// rewriting each of their uses.
+bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &S) {
+  if (S.begin() == S.end())
+    return false;
+
+  unsigned NumPartitions = 0;
  bool Changed = false;
-  for (AllocaPartitioning::iterator PI = P.begin(), PE = P.end(); PI != PE;
-       ++PI)
-    Changed |= rewriteAllocaPartition(AI, P, PI);
+  SmallVector<AllocaSlices::iterator, 4> SplitUses;
+  uint64_t MaxSplitUseEndOffset = 0;
+
+  uint64_t BeginOffset = S.begin()->beginOffset();
+
+  for (AllocaSlices::iterator SI = S.begin(), SJ = llvm::next(SI), SE = S.end();
+       SI != SE; SI = SJ) {
+    uint64_t MaxEndOffset = SI->endOffset();
+
+    if (!SI->isSplittable()) {
+      // When we're forming an unsplittable region, it must always start at the
+      // first slice and will extend through its end.
+      assert(BeginOffset == SI->beginOffset());
+
+      // Form a partition including all of the overlapping slices with this
+      // unsplittable slice.
+      while (SJ != SE && SJ->beginOffset() < MaxEndOffset) {
+        if (!SJ->isSplittable())
+          MaxEndOffset = std::max(MaxEndOffset, SJ->endOffset());
+        ++SJ;
+      }
+    } else {
+      assert(SI->isSplittable()); // Established above.
+
+      // Collect all of the overlapping splittable slices.
+      while (SJ != SE && SJ->beginOffset() < MaxEndOffset &&
+             SJ->isSplittable()) {
+        MaxEndOffset = std::max(MaxEndOffset, SJ->endOffset());
+        ++SJ;
+      }
+
+      // Back up MaxEndOffset and SJ if we ended the span early when
+      // encountering an unsplittable slice.
+      if (SJ != SE && SJ->beginOffset() < MaxEndOffset) {
+        assert(!SJ->isSplittable());
+        MaxEndOffset = SJ->beginOffset();
+      }
+    }
+
+    // Check if we have managed to move the end offset forward yet. If so,
+    // we'll have to rewrite uses and erase old split uses.
+    if (BeginOffset < MaxEndOffset) {
+      // Rewrite a sequence of overlapping slices.
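+      // (For example, with purely illustrative offsets: splittable slices
+      // [0,8) and [4,16) merge into a single partition [0,16), while an
+      // unsplittable slice [0,16) would force every slice overlapping it
+      // into that one partition.)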
+ Changed |= + rewritePartition(AI, S, SI, SJ, BeginOffset, MaxEndOffset, SplitUses); + ++NumPartitions; + + removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset, MaxEndOffset); + } + + // Accumulate all the splittable slices from the [SI,SJ) region which + // overlap going forward. + for (AllocaSlices::iterator SK = SI; SK != SJ; ++SK) + if (SK->isSplittable() && SK->endOffset() > MaxEndOffset) { + SplitUses.push_back(SK); + MaxSplitUseEndOffset = std::max(SK->endOffset(), MaxSplitUseEndOffset); + } + + // If we're already at the end and we have no split uses, we're done. + if (SJ == SE && SplitUses.empty()) + break; + + // If we have no split uses or no gap in offsets, we're ready to move to + // the next slice. + if (SplitUses.empty() || (SJ != SE && MaxEndOffset == SJ->beginOffset())) { + BeginOffset = SJ->beginOffset(); + continue; + } + + // Even if we have split slices, if the next slice is splittable and the + // split slices reach it, we can simply set up the beginning offset of the + // next iteration to bridge between them. + if (SJ != SE && SJ->isSplittable() && + MaxSplitUseEndOffset > SJ->beginOffset()) { + BeginOffset = MaxEndOffset; + continue; + } + + // Otherwise, we have a tail of split slices. Rewrite them with an empty + // range of slices. + uint64_t PostSplitEndOffset = + SJ == SE ? MaxSplitUseEndOffset : SJ->beginOffset(); + + Changed |= rewritePartition(AI, S, SJ, SJ, MaxEndOffset, PostSplitEndOffset, + SplitUses); + ++NumPartitions; + + if (SJ == SE) + break; // Skip the rest, we don't need to do any cleanup. + + removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset, + PostSplitEndOffset); + + // Now just reset the begin offset for the next iteration. + BeginOffset = SJ->beginOffset(); + } + + NumAllocaPartitions += NumPartitions; + MaxPartitionsPerAlloca = + std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca); return Changed; } @@ -3545,7 +3329,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaPartitioning &P) { /// \brief Analyze an alloca for SROA. /// /// This analyzes the alloca to ensure we can reason about it, builds -/// a partitioning of the alloca, and then hands it off to be split and +/// the slices of the alloca, and then hands it off to be split and /// rewritten as needed. bool SROA::runOnAlloca(AllocaInst &AI) { DEBUG(dbgs() << "SROA alloca: " << AI << "\n"); @@ -3559,32 +3343,32 @@ bool SROA::runOnAlloca(AllocaInst &AI) { // Skip alloca forms that this analysis can't handle. if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() || - TD->getTypeAllocSize(AI.getAllocatedType()) == 0) + DL->getTypeAllocSize(AI.getAllocatedType()) == 0) return false; bool Changed = false; // First, split any FCA loads and stores touching this alloca to promote // better splitting and promotion opportunities. - AggLoadStoreRewriter AggRewriter(*TD); + AggLoadStoreRewriter AggRewriter(*DL); Changed |= AggRewriter.rewrite(AI); - // Build the partition set using a recursive instruction-visiting builder. - AllocaPartitioning P(*TD, AI); - DEBUG(P.print(dbgs())); - if (P.isEscaped()) + // Build the slices using a recursive instruction-visiting builder. + AllocaSlices S(*DL, AI); + DEBUG(S.print(dbgs())); + if (S.isEscaped()) return Changed; // Delete all the dead users of this alloca before splitting and rewriting it. 
- for (AllocaPartitioning::dead_user_iterator DI = P.dead_user_begin(), - DE = P.dead_user_end(); + for (AllocaSlices::dead_user_iterator DI = S.dead_user_begin(), + DE = S.dead_user_end(); DI != DE; ++DI) { Changed = true; (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType())); DeadInsts.insert(*DI); } - for (AllocaPartitioning::dead_op_iterator DO = P.dead_op_begin(), - DE = P.dead_op_end(); + for (AllocaSlices::dead_op_iterator DO = S.dead_op_begin(), + DE = S.dead_op_end(); DO != DE; ++DO) { Value *OldV = **DO; // Clobber the use with an undef value. @@ -3596,11 +3380,21 @@ bool SROA::runOnAlloca(AllocaInst &AI) { } } - // No partitions to split. Leave the dead alloca for a later pass to clean up. - if (P.begin() == P.end()) + // No slices to split. Leave the dead alloca for a later pass to clean up. + if (S.begin() == S.end()) return Changed; - return splitAlloca(AI, P) || Changed; + Changed |= splitAlloca(AI, S); + + DEBUG(dbgs() << " Speculating PHIs\n"); + while (!SpeculatablePHIs.empty()) + speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val()); + + DEBUG(dbgs() << " Speculating Selects\n"); + while (!SpeculatableSelects.empty()) + speculateSelectInstLoads(*SpeculatableSelects.pop_back_val()); + + return Changed; } /// \brief Delete the dead instructions accumulated in this run. @@ -3635,6 +3429,15 @@ void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) { } } +static void enqueueUsersInWorklist(Instruction &I, + SmallVectorImpl<Instruction *> &Worklist, + SmallPtrSet<Instruction *, 8> &Visited) { + for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; + ++UI) + if (Visited.insert(cast<Instruction>(*UI))) + Worklist.push_back(cast<Instruction>(*UI)); +} + /// \brief Promote the allocas, using the best available technique. /// /// This attempts to promote whatever allocas have been identified as viable in @@ -3659,25 +3462,28 @@ bool SROA::promoteAllocas(Function &F) { DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n"); SSAUpdater SSA; DIBuilder DIB(*F.getParent()); - SmallVector<Instruction*, 64> Insts; + SmallVector<Instruction *, 64> Insts; + + // We need a worklist to walk the uses of each alloca. + SmallVector<Instruction *, 8> Worklist; + SmallPtrSet<Instruction *, 8> Visited; + SmallVector<Instruction *, 32> DeadInsts; for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) { AllocaInst *AI = PromotableAllocas[Idx]; - for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end(); - UI != UE;) { - Instruction *I = cast<Instruction>(*UI++); + Insts.clear(); + Worklist.clear(); + Visited.clear(); + + enqueueUsersInWorklist(*AI, Worklist, Visited); + + while (!Worklist.empty()) { + Instruction *I = Worklist.pop_back_val(); + // FIXME: Currently the SSAUpdater infrastructure doesn't reason about // lifetime intrinsics and so we strip them (and the bitcasts+GEPs // leading to them) here. Eventually it should use them to optimize the // scalar values produced. 
- if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) { - assert(onlyUsedByLifetimeMarkers(I) && - "Found a bitcast used outside of a lifetime marker."); - while (!I->use_empty()) - cast<Instruction>(*I->use_begin())->eraseFromParent(); - I->eraseFromParent(); - continue; - } if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { assert(II->getIntrinsicID() == Intrinsic::lifetime_start || II->getIntrinsicID() == Intrinsic::lifetime_end); @@ -3685,10 +3491,30 @@ bool SROA::promoteAllocas(Function &F) { continue; } - Insts.push_back(I); + // Push the loads and stores we find onto the list. SROA will already + // have validated that all loads and stores are viable candidates for + // promotion. + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + assert(LI->getType() == AI->getAllocatedType()); + Insts.push_back(LI); + continue; + } + if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + assert(SI->getValueOperand()->getType() == AI->getAllocatedType()); + Insts.push_back(SI); + continue; + } + + // For everything else, we know that only no-op bitcasts and GEPs will + // make it this far, just recurse through them and recall them for later + // removal. + DeadInsts.push_back(I); + enqueueUsersInWorklist(*I, Worklist, Visited); } AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts); - Insts.clear(); + while (!DeadInsts.empty()) + DeadInsts.pop_back_val()->eraseFromParent(); + AI->eraseFromParent(); } PromotableAllocas.clear(); @@ -3712,8 +3538,8 @@ namespace { bool SROA::runOnFunction(Function &F) { DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); C = &F.getContext(); - TD = getAnalysisIfAvailable<DataLayout>(); - if (!TD) { + DL = getAnalysisIfAvailable<DataLayout>(); + if (!DL) { DEBUG(dbgs() << " Skipping SROA -- no target data!\n"); return false; } diff --git a/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp b/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp new file mode 100644 index 0000000..9bcd702 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp @@ -0,0 +1,479 @@ +//===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the SampleProfileLoader transformation. This pass +// reads a profile file generated by a sampling profiler (e.g. Linux Perf - +// http://perf.wiki.kernel.org/) and generates IR metadata to reflect the +// profile information in the given profile. +// +// This pass generates branch weight annotations on the IR: +// +// - prof: Represents branch weights. This annotation is added to branches +// to indicate the weights of each edge coming out of the branch. +// The weight of each edge is the weight of the target block for +// that edge. The weight of a block B is computed as the maximum +// number of samples found in B. 
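+//
+// For example (the weights here are hypothetical), a conditional branch
+// whose successor blocks were sampled 120 and 40 times would be annotated
+// roughly as:
+//
+//     br i1 %cmp, label %if.then, label %if.else, !prof !0
+//     ...
+//     !0 = metadata !{metadata !"branch_weights", i32 120, i32 40}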
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "sample-profile" + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/DebugInfo/DIContext.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/InstIterator.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Regex.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" + +using namespace llvm; + +// Command line option to specify the file to read samples from. This is +// mainly used for debugging. +static cl::opt<std::string> SampleProfileFile( + "sample-profile-file", cl::init(""), cl::value_desc("filename"), + cl::desc("Profile file loaded by -sample-profile"), cl::Hidden); + +namespace { +/// \brief Sample-based profile reader. +/// +/// Each profile contains sample counts for all the functions +/// executed. Inside each function, statements are annotated with the +/// collected samples on all the instructions associated with that +/// statement. +/// +/// For this to produce meaningful data, the program needs to be +/// compiled with some debug information (at minimum, line numbers: +/// -gline-tables-only). Otherwise, it will be impossible to match IR +/// instructions to the line numbers collected by the profiler. +/// +/// From the profile file, we are interested in collecting the +/// following information: +/// +/// * A list of functions included in the profile (mangled names). +/// +/// * For each function F: +/// 1. The total number of samples collected in F. +/// +/// 2. The samples collected at each line in F. To provide some +/// protection against source code shuffling, line numbers should +/// be relative to the start of the function. +class SampleProfile { +public: + SampleProfile(StringRef F) : Profiles(0), Filename(F) {} + + void dump(); + void loadText(); + void loadNative() { llvm_unreachable("not implemented"); } + bool emitAnnotations(Function &F); + void printFunctionProfile(raw_ostream &OS, StringRef FName); + void dumpFunctionProfile(StringRef FName); + +protected: + typedef DenseMap<uint32_t, uint32_t> BodySampleMap; + typedef DenseMap<BasicBlock *, uint32_t> BlockWeightMap; + + /// \brief Representation of the runtime profile for a function. + /// + /// This data structure contains the runtime profile for a given + /// function. It contains the total number of samples collected + /// in the function and a map of samples collected in every statement. + struct FunctionProfile { + /// \brief Total number of samples collected inside this function. + /// + /// Samples are cumulative, they include all the samples collected + /// inside this function and all its inlined callees. + unsigned TotalSamples; + + // \brief Total number of samples collected at the head of the function. + unsigned TotalHeadSamples; + + /// \brief Map line offsets to collected samples. + /// + /// Each entry in this map contains the number of samples + /// collected at the corresponding line offset. All line locations + /// are an offset from the start of the function. + BodySampleMap BodySamples; + + /// \brief Map basic blocks to their computed weights. 
+    ///
+    /// The weight of a basic block is defined to be the maximum
+    /// of all the instruction weights in that block.
+    BlockWeightMap BlockWeights;
+  };
+
+  uint32_t getInstWeight(Instruction &I, unsigned FirstLineno,
+                         BodySampleMap &BodySamples);
+  uint32_t computeBlockWeight(BasicBlock *B, unsigned FirstLineno,
+                              BodySampleMap &BodySamples);
+
+  /// \brief Map every function to its associated profile.
+  ///
+  /// The profile of every function executed at runtime is collected
+  /// in the structure FunctionProfile. This maps function objects
+  /// to their corresponding profiles.
+  StringMap<FunctionProfile> Profiles;
+
+  /// \brief Path name to the file holding the profile data.
+  ///
+  /// The format of this file is defined by each profiler
+  /// independently. If possible, the profiler should have a text
+  /// version of the profile format to be used in constructing test
+  /// cases and debugging.
+  StringRef Filename;
+};
+
+/// \brief Loader class for text-based profiles.
+///
+/// This class defines a simple interface to read text files containing
+/// profiles. It keeps track of line number information and location of
+/// the file pointer. Users of this class are responsible for actually
+/// parsing the lines returned by the readLine function.
+///
+/// TODO - This does not really belong here. It is a generic text file
+/// reader. It should be moved to the Support library and made more general.
+class ExternalProfileTextLoader {
+public:
+  ExternalProfileTextLoader(StringRef F) : Filename(F) {
+    error_code EC;
+    EC = MemoryBuffer::getFile(Filename, Buffer);
+    if (EC)
+      report_fatal_error("Could not open profile file " + Filename + ": " +
+                         EC.message());
+    FP = Buffer->getBufferStart();
+    Lineno = 0;
+  }
+
+  /// \brief Read a line from the mapped file.
+  StringRef readLine() {
+    size_t Length = 0;
+    const char *start = FP;
+    while (FP != Buffer->getBufferEnd() && *FP != '\n') {
+      Length++;
+      FP++;
+    }
+    if (FP != Buffer->getBufferEnd())
+      FP++;
+    Lineno++;
+    return StringRef(start, Length);
+  }
+
+  /// \brief Return true if we've reached EOF.
+  bool atEOF() const { return FP == Buffer->getBufferEnd(); }
+
+  /// \brief Report a parse error message and stop compilation.
+  void reportParseError(Twine Msg) const {
+    report_fatal_error(Filename + ":" + Twine(Lineno) + ": " + Msg + "\n");
+  }
+
+private:
+  /// \brief Memory buffer holding the text file.
+  OwningPtr<MemoryBuffer> Buffer;
+
+  /// \brief Current position into the memory buffer.
+  const char *FP;
+
+  /// \brief Current line number.
+  int64_t Lineno;
+
+  /// \brief Path name of the profile file.
+  StringRef Filename;
+};
+
+/// \brief Sample profile pass.
+///
+/// This pass reads profile data from the file specified by
+/// -sample-profile-file and annotates every affected function with the
+/// profile information found in that file.
+class SampleProfileLoader : public FunctionPass {
+public:
+  // Class identification, replacement for typeinfo
+  static char ID;
+
+  SampleProfileLoader(StringRef Name = SampleProfileFile)
+      : FunctionPass(ID), Profiler(0), Filename(Name) {
+    initializeSampleProfileLoaderPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual bool doInitialization(Module &M);
+
+  void dump() { Profiler->dump(); }
+
+  virtual const char *getPassName() const { return "Sample profile pass"; }
+
+  virtual bool runOnFunction(Function &F);
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesCFG();
+  }
+
+protected:
+  /// \brief Profile reader object.
+ OwningPtr<SampleProfile> Profiler; + + /// \brief Name of the profile file to load. + StringRef Filename; +}; +} + +/// \brief Print the function profile for \p FName on stream \p OS. +/// +/// \param OS Stream to emit the output to. +/// \param FName Name of the function to print. +void SampleProfile::printFunctionProfile(raw_ostream &OS, StringRef FName) { + FunctionProfile FProfile = Profiles[FName]; + OS << "Function: " << FName << ", " << FProfile.TotalSamples << ", " + << FProfile.TotalHeadSamples << ", " << FProfile.BodySamples.size() + << " sampled lines\n"; + for (BodySampleMap::const_iterator SI = FProfile.BodySamples.begin(), + SE = FProfile.BodySamples.end(); + SI != SE; ++SI) + OS << "\tline offset: " << SI->first + << ", number of samples: " << SI->second << "\n"; + OS << "\n"; +} + +/// \brief Dump the function profile for \p FName. +/// +/// \param FName Name of the function to print. +void SampleProfile::dumpFunctionProfile(StringRef FName) { + printFunctionProfile(dbgs(), FName); +} + +/// \brief Dump all the function profiles found. +void SampleProfile::dump() { + for (StringMap<FunctionProfile>::const_iterator I = Profiles.begin(), + E = Profiles.end(); + I != E; ++I) + dumpFunctionProfile(I->getKey()); +} + +/// \brief Load samples from a text file. +/// +/// The file is divided in two segments: +/// +/// Symbol table (represented with the string "symbol table") +/// Number of symbols in the table +/// symbol 1 +/// symbol 2 +/// ... +/// symbol N +/// +/// Function body profiles +/// function1:total_samples:total_head_samples:number_of_locations +/// location_offset_1: number_of_samples +/// location_offset_2: number_of_samples +/// ... +/// location_offset_N: number_of_samples +/// +/// Function names must be mangled in order for the profile loader to +/// match them in the current translation unit. +/// +/// Since this is a flat profile, a function that shows up more than +/// once gets all its samples aggregated across all its instances. +/// TODO - flat profiles are too imprecise to provide good optimization +/// opportunities. Convert them to context-sensitive profile. +/// +/// This textual representation is useful to generate unit tests and +/// for debugging purposes, but it should not be used to generate +/// profiles for large programs, as the representation is extremely +/// inefficient. +void SampleProfile::loadText() { + ExternalProfileTextLoader Loader(Filename); + + // Read the symbol table. + StringRef Line = Loader.readLine(); + if (Line != "symbol table") + Loader.reportParseError("Expected 'symbol table', found " + Line); + int NumSymbols; + Line = Loader.readLine(); + if (Line.getAsInteger(10, NumSymbols)) + Loader.reportParseError("Expected a number, found " + Line); + for (int I = 0; I < NumSymbols; I++) { + StringRef FName = Loader.readLine(); + FunctionProfile &FProfile = Profiles[FName]; + FProfile.BodySamples.clear(); + FProfile.TotalSamples = 0; + FProfile.TotalHeadSamples = 0; + } + + // Read the profile of each function. Since each function may be + // mentioned more than once, and we are collecting flat profiles, + // accumulate samples as we parse them. 
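+  // As a sketch (hypothetical input), a profile for one function with two
+  // sampled line offsets could look like:
+  //
+  //   symbol table
+  //   1
+  //   main
+  //   main:184:0:2
+  //   1: 120
+  //   2: 64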
+ Regex HeadRE("^([^:]+):([0-9]+):([0-9]+):([0-9]+)$"); + Regex LineSample("^([0-9]+): ([0-9]+)$"); + while (!Loader.atEOF()) { + SmallVector<StringRef, 4> Matches; + Line = Loader.readLine(); + if (!HeadRE.match(Line, &Matches)) + Loader.reportParseError("Expected 'mangled_name:NUM:NUM:NUM', found " + + Line); + assert(Matches.size() == 5); + StringRef FName = Matches[1]; + unsigned NumSamples, NumHeadSamples, NumSampledLines; + Matches[2].getAsInteger(10, NumSamples); + Matches[3].getAsInteger(10, NumHeadSamples); + Matches[4].getAsInteger(10, NumSampledLines); + FunctionProfile &FProfile = Profiles[FName]; + FProfile.TotalSamples += NumSamples; + FProfile.TotalHeadSamples += NumHeadSamples; + BodySampleMap &SampleMap = FProfile.BodySamples; + unsigned I; + for (I = 0; I < NumSampledLines && !Loader.atEOF(); I++) { + Line = Loader.readLine(); + if (!LineSample.match(Line, &Matches)) + Loader.reportParseError("Expected 'NUM: NUM', found " + Line); + assert(Matches.size() == 3); + unsigned LineOffset, NumSamples; + Matches[1].getAsInteger(10, LineOffset); + Matches[2].getAsInteger(10, NumSamples); + SampleMap[LineOffset] += NumSamples; + } + + if (I < NumSampledLines) + Loader.reportParseError("Unexpected end of file"); + } +} + +/// \brief Get the weight for an instruction. +/// +/// The "weight" of an instruction \p Inst is the number of samples +/// collected on that instruction at runtime. To retrieve it, we +/// need to compute the line number of \p Inst relative to the start of its +/// function. We use \p FirstLineno to compute the offset. We then +/// look up the samples collected for \p Inst using \p BodySamples. +/// +/// \param Inst Instruction to query. +/// \param FirstLineno Line number of the first instruction in the function. +/// \param BodySamples Map of relative source line locations to samples. +/// +/// \returns The profiled weight of I. +uint32_t SampleProfile::getInstWeight(Instruction &Inst, unsigned FirstLineno, + BodySampleMap &BodySamples) { + unsigned LOffset = Inst.getDebugLoc().getLine() - FirstLineno + 1; + return BodySamples.lookup(LOffset); +} + +/// \brief Compute the weight of a basic block. +/// +/// The weight of basic block \p B is the maximum weight of all the +/// instructions in B. +/// +/// \param B The basic block to query. +/// \param FirstLineno The line number for the first line in the +/// function holding B. +/// \param BodySamples The map containing all the samples collected in that +/// function. +/// +/// \returns The computed weight of B. +uint32_t SampleProfile::computeBlockWeight(BasicBlock *B, unsigned FirstLineno, + BodySampleMap &BodySamples) { + // If we've computed B's weight before, return it. + Function *F = B->getParent(); + FunctionProfile &FProfile = Profiles[F->getName()]; + std::pair<BlockWeightMap::iterator, bool> Entry = + FProfile.BlockWeights.insert(std::make_pair(B, 0)); + if (!Entry.second) + return Entry.first->second; + + // Otherwise, compute and cache B's weight. + uint32_t Weight = 0; + for (BasicBlock::iterator I = B->begin(), E = B->end(); I != E; ++I) { + uint32_t InstWeight = getInstWeight(*I, FirstLineno, BodySamples); + if (InstWeight > Weight) + Weight = InstWeight; + } + Entry.first->second = Weight; + return Weight; +} + +/// \brief Generate branch weight metadata for all branches in \p F. +/// +/// For every branch instruction B in \p F, we compute the weight of the +/// target block for each of the edges out of B. This is the weight +/// that we associate with that branch. 
+/// +/// TODO - This weight assignment will most likely be wrong if the +/// target branch has more than two predecessors. This needs to be done +/// using some form of flow propagation. +/// +/// Once all the branch weights are computed, we emit the MD_prof +/// metadata on B using the computed values. +/// +/// \param F The function to query. +bool SampleProfile::emitAnnotations(Function &F) { + bool Changed = false; + FunctionProfile &FProfile = Profiles[F.getName()]; + unsigned FirstLineno = inst_begin(F)->getDebugLoc().getLine(); + MDBuilder MDB(F.getContext()); + + // Clear the block weights cache. + FProfile.BlockWeights.clear(); + + // When we find a branch instruction: For each edge E out of the branch, + // the weight of E is the weight of the target block. + for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { + BasicBlock *B = I; + TerminatorInst *TI = B->getTerminator(); + if (TI->getNumSuccessors() == 1) + continue; + if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI)) + continue; + + SmallVector<uint32_t, 4> Weights; + unsigned NSuccs = TI->getNumSuccessors(); + for (unsigned I = 0; I < NSuccs; ++I) { + BasicBlock *Succ = TI->getSuccessor(I); + uint32_t Weight = + computeBlockWeight(Succ, FirstLineno, FProfile.BodySamples); + Weights.push_back(Weight); + } + + TI->setMetadata(llvm::LLVMContext::MD_prof, + MDB.createBranchWeights(Weights)); + Changed = true; + } + + return Changed; +} + +char SampleProfileLoader::ID = 0; +INITIALIZE_PASS(SampleProfileLoader, "sample-profile", "Sample Profile loader", + false, false) + +bool SampleProfileLoader::runOnFunction(Function &F) { + return Profiler->emitAnnotations(F); +} + +bool SampleProfileLoader::doInitialization(Module &M) { + Profiler.reset(new SampleProfile(Filename)); + Profiler->loadText(); + return true; +} + +FunctionPass *llvm::createSampleProfileLoaderPass() { + return new SampleProfileLoader(SampleProfileFile); +} + +FunctionPass *llvm::createSampleProfileLoaderPass(StringRef Name) { + return new SampleProfileLoader(Name); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp index 8a9c7da..857597e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -28,7 +28,7 @@ using namespace llvm; /// ScalarOpts library. 
void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeADCEPass(Registry); - initializeBlockPlacementPass(Registry); + initializeSampleProfileLoaderPass(Registry); initializeCodeGenPreparePass(Registry); initializeConstantPropagationPass(Registry); initializeCorrelatedValuePropagationPass(Registry); @@ -44,12 +44,14 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLoopInstSimplifyPass(Registry); initializeLoopRotatePass(Registry); initializeLoopStrengthReducePass(Registry); + initializeLoopRerollPass(Registry); initializeLoopUnrollPass(Registry); initializeLoopUnswitchPass(Registry); initializeLoopIdiomRecognizePass(Registry); initializeLowerAtomicPass(Registry); initializeLowerExpectIntrinsicPass(Registry); initializeMemCpyOptPass(Registry); + initializePartiallyInlineLibCallsPass(Registry); initializeReassociatePass(Registry); initializeRegToMemPass(Registry); initializeSCCPPass(Registry); @@ -58,7 +60,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeSROA_DTPass(Registry); initializeSROA_SSAUpPass(Registry); initializeCFGSimplifyPassPass(Registry); - initializeSimplifyLibCallsPass(Registry); + initializeStructurizeCFGPass(Registry); initializeSinkingPass(Registry); initializeTailCallElimPass(Registry); } @@ -111,6 +113,10 @@ void LLVMAddLoopRotatePass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLoopRotatePass()); } +void LLVMAddLoopRerollPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLoopRerollPass()); +} + void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLoopUnrollPass()); } @@ -123,6 +129,10 @@ void LLVMAddMemCpyOptPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createMemCpyOptPass()); } +void LLVMAddPartiallyInlineLibCallsPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createPartiallyInlineLibCallsPass()); +} + void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createPromoteMemoryToRegisterPass()); } @@ -149,7 +159,7 @@ void LLVMAddScalarReplAggregatesPassWithThreshold(LLVMPassManagerRef PM, } void LLVMAddSimplifyLibCallsPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createSimplifyLibCallsPass()); + // NOTE: The simplify-libcalls pass has been removed. 
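+  // (This entry point is kept as a deliberate no-op for C API compatibility;
+  // the simplifications themselves were folded into the instruction
+  // combiner.)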
} void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM) { diff --git a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp index bfde334..57b290e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -166,21 +166,21 @@ namespace { void DeleteDeadInstructions(); void RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, - SmallVector<AllocaInst*, 32> &NewElts); + SmallVectorImpl<AllocaInst *> &NewElts); void RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset, - SmallVector<AllocaInst*, 32> &NewElts); + SmallVectorImpl<AllocaInst *> &NewElts); void RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, - SmallVector<AllocaInst*, 32> &NewElts); + SmallVectorImpl<AllocaInst *> &NewElts); void RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, uint64_t Offset, - SmallVector<AllocaInst*, 32> &NewElts); + SmallVectorImpl<AllocaInst *> &NewElts); void RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, AllocaInst *AI, - SmallVector<AllocaInst*, 32> &NewElts); + SmallVectorImpl<AllocaInst *> &NewElts); void RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, - SmallVector<AllocaInst*, 32> &NewElts); + SmallVectorImpl<AllocaInst *> &NewElts); void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, - SmallVector<AllocaInst*, 32> &NewElts); + SmallVectorImpl<AllocaInst *> &NewElts); bool ShouldAttemptScalarRepl(AllocaInst *AI); }; @@ -963,7 +963,7 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy()) SV = Builder.CreateBitCast(SV, IntegerType::get(SV->getContext(),SrcWidth)); else if (SV->getType()->isPointerTy()) - SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext())); + SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getType())); // Zero extend or truncate the value if needed. if (SV->getType() != AllocaType) { @@ -1066,12 +1066,12 @@ public: LoadAndStorePromoter::run(Insts); AI->eraseFromParent(); - for (SmallVector<DbgDeclareInst *, 4>::iterator I = DDIs.begin(), + for (SmallVectorImpl<DbgDeclareInst *>::iterator I = DDIs.begin(), E = DDIs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; DDI->eraseFromParent(); } - for (SmallVector<DbgValueInst *, 4>::iterator I = DVIs.begin(), + for (SmallVectorImpl<DbgValueInst *>::iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; DVI->eraseFromParent(); @@ -1086,7 +1086,7 @@ public: } virtual void updateDebugInfo(Instruction *Inst) const { - for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(), + for (SmallVectorImpl<DbgDeclareInst *>::const_iterator I = DDIs.begin(), E = DDIs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) @@ -1094,7 +1094,7 @@ public: else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) ConvertDebugDeclareToDebugValue(DDI, LI, *DIB); } - for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), + for (SmallVectorImpl<DbgValueInst *>::const_iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; Value *Arg = NULL; @@ -1865,7 +1865,7 @@ bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) { /// Offset indicates the position within AI that is referenced by this /// instruction. 
void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, - SmallVector<AllocaInst*, 32> &NewElts) { + SmallVectorImpl<AllocaInst *> &NewElts) { for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E;) { Use &TheUse = UI.getUse(); Instruction *User = cast<Instruction>(*UI++); @@ -1979,7 +1979,7 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, /// RewriteBitCast - Update a bitcast reference to the alloca being replaced /// and recursively continue updating all of its uses. void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset, - SmallVector<AllocaInst*, 32> &NewElts) { + SmallVectorImpl<AllocaInst *> &NewElts) { RewriteForScalarRepl(BC, AI, Offset, NewElts); if (BC->getOperand(0) != AI) return; @@ -2037,7 +2037,7 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, /// elements of the alloca that are being split apart, and if so, rewrite /// the GEP to be relative to the new element. void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, - SmallVector<AllocaInst*, 32> &NewElts) { + SmallVectorImpl<AllocaInst *> &NewElts) { uint64_t OldOffset = Offset; SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end()); // If the GEP was dynamic then it must have been a dynamic vector lookup. @@ -2099,7 +2099,7 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, /// to mark the lifetime of the scalarized memory. void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, uint64_t Offset, - SmallVector<AllocaInst*, 32> &NewElts) { + SmallVectorImpl<AllocaInst *> &NewElts) { ConstantInt *OldSize = cast<ConstantInt>(II->getArgOperand(0)); // Put matching lifetime markers on everything from Offset up to // Offset+OldSize. @@ -2153,9 +2153,10 @@ void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, /// RewriteMemIntrinUserOfAlloca - MI is a memcpy/memset/memmove from or to AI. /// Rewrite it to copy or set the elements of the scalarized memory. -void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, - AllocaInst *AI, - SmallVector<AllocaInst*, 32> &NewElts) { +void +SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, + AllocaInst *AI, + SmallVectorImpl<AllocaInst *> &NewElts) { // If this is a memcpy/memmove, construct the other pointer as the // appropriate type. The "Other" pointer is the pointer that goes to memory // that doesn't have anything to do with the alloca that we are promoting. For @@ -2189,7 +2190,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, if (OtherPtr == AI || OtherPtr == NewElts[0]) { // This code will run twice for a no-op memcpy -- once for each operand. // Put only one reference to MI on the DeadInsts list. - for (SmallVector<Value*, 32>::const_iterator I = DeadInsts.begin(), + for (SmallVectorImpl<Value *>::const_iterator I = DeadInsts.begin(), E = DeadInsts.end(); I != E; ++I) if (*I == MI) return; DeadInsts.push_back(MI); @@ -2326,8 +2327,9 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, /// RewriteStoreUserOfWholeAlloca - We found a store of an integer that /// overwrites the entire allocation. Extract out the pieces of the stored /// integer and store them individually. 
-void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, - SmallVector<AllocaInst*, 32> &NewElts){ +void +SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, + SmallVectorImpl<AllocaInst *> &NewElts) { // Extract each element out of the integer according to its structure offset // and store the element value to the individual alloca. Value *SrcVal = SI->getOperand(0); @@ -2440,8 +2442,9 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, /// RewriteLoadUserOfWholeAlloca - We found a load of the entire allocation to /// an integer. Load the individual pieces to form the aggregate value. -void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, - SmallVector<AllocaInst*, 32> &NewElts) { +void +SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, + SmallVectorImpl<AllocaInst *> &NewElts) { // Extract each element out of the NewElts according to its structure offset // and form the result value. Type *AllocaEltTy = AI->getAllocatedType(); diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index c243d34..8371f6d 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -41,187 +41,31 @@ using namespace llvm; STATISTIC(NumSimpl, "Number of blocks simplified"); namespace { - struct CFGSimplifyPass : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - CFGSimplifyPass() : FunctionPass(ID) { - initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); - } - - virtual bool runOnFunction(Function &F); +struct CFGSimplifyPass : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + CFGSimplifyPass() : FunctionPass(ID) { + initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); + } + virtual bool runOnFunction(Function &F); - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<TargetTransformInfo>(); - } - }; + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetTransformInfo>(); + } +}; } char CFGSimplifyPass::ID = 0; -INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", - false, false) +INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false, + false) INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) -INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", - false, false) +INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false, + false) // Public interface to the CFGSimplification pass FunctionPass *llvm::createCFGSimplificationPass() { return new CFGSimplifyPass(); } -/// changeToUnreachable - Insert an unreachable instruction before the specified -/// instruction, making it and the rest of the code in the block dead. -static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) { - BasicBlock *BB = I->getParent(); - // Loop over all of the successors, removing BB's entry from any PHI - // nodes. - for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) - (*SI)->removePredecessor(BB); - - // Insert a call to llvm.trap right before this. This turns the undefined - // behavior into a hard fail instead of falling through into random code. 
- if (UseLLVMTrap) { - Function *TrapFn = - Intrinsic::getDeclaration(BB->getParent()->getParent(), Intrinsic::trap); - CallInst *CallTrap = CallInst::Create(TrapFn, "", I); - CallTrap->setDebugLoc(I->getDebugLoc()); - } - new UnreachableInst(I->getContext(), I); - - // All instructions after this are dead. - BasicBlock::iterator BBI = I, BBE = BB->end(); - while (BBI != BBE) { - if (!BBI->use_empty()) - BBI->replaceAllUsesWith(UndefValue::get(BBI->getType())); - BB->getInstList().erase(BBI++); - } -} - -/// changeToCall - Convert the specified invoke into a normal call. -static void changeToCall(InvokeInst *II) { - SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); - CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II); - NewCall->takeName(II); - NewCall->setCallingConv(II->getCallingConv()); - NewCall->setAttributes(II->getAttributes()); - NewCall->setDebugLoc(II->getDebugLoc()); - II->replaceAllUsesWith(NewCall); - - // Follow the call by a branch to the normal destination. - BranchInst::Create(II->getNormalDest(), II); - - // Update PHI nodes in the unwind destination - II->getUnwindDest()->removePredecessor(II->getParent()); - II->eraseFromParent(); -} - -static bool markAliveBlocks(BasicBlock *BB, - SmallPtrSet<BasicBlock*, 128> &Reachable) { - - SmallVector<BasicBlock*, 128> Worklist; - Worklist.push_back(BB); - Reachable.insert(BB); - bool Changed = false; - do { - BB = Worklist.pop_back_val(); - - // Do a quick scan of the basic block, turning any obviously unreachable - // instructions into LLVM unreachable insts. The instruction combining pass - // canonicalizes unreachable insts into stores to null or undef. - for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;++BBI){ - if (CallInst *CI = dyn_cast<CallInst>(BBI)) { - if (CI->doesNotReturn()) { - // If we found a call to a no-return function, insert an unreachable - // instruction after it. Make sure there isn't *already* one there - // though. - ++BBI; - if (!isa<UnreachableInst>(BBI)) { - // Don't insert a call to llvm.trap right before the unreachable. - changeToUnreachable(BBI, false); - Changed = true; - } - break; - } - } - - // Store to undef and store to null are undefined and used to signal that - // they should be changed to unreachable by passes that can't modify the - // CFG. - if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) { - // Don't touch volatile stores. - if (SI->isVolatile()) continue; - - Value *Ptr = SI->getOperand(1); - - if (isa<UndefValue>(Ptr) || - (isa<ConstantPointerNull>(Ptr) && - SI->getPointerAddressSpace() == 0)) { - changeToUnreachable(SI, true); - Changed = true; - break; - } - } - } - - // Turn invokes that call 'nounwind' functions into ordinary calls. - if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) { - Value *Callee = II->getCalledValue(); - if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) { - changeToUnreachable(II, true); - Changed = true; - } else if (II->doesNotThrow()) { - if (II->use_empty() && II->onlyReadsMemory()) { - // jump to the normal destination branch. 
-          BranchInst::Create(II->getNormalDest(), II);
-          II->getUnwindDest()->removePredecessor(II->getParent());
-          II->eraseFromParent();
-        } else
-          changeToCall(II);
-        Changed = true;
-      }
-    }
-
-    Changed |= ConstantFoldTerminator(BB, true);
-    for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
-      if (Reachable.insert(*SI))
-        Worklist.push_back(*SI);
-  } while (!Worklist.empty());
-  return Changed;
-}
-
-/// removeUnreachableBlocksFromFn - Remove blocks that are not reachable, even
-/// if they are in a dead cycle. Return true if a change was made, false
-/// otherwise.
-static bool removeUnreachableBlocksFromFn(Function &F) {
-  SmallPtrSet<BasicBlock*, 128> Reachable;
-  bool Changed = markAliveBlocks(F.begin(), Reachable);
-
-  // If there are unreachable blocks in the CFG...
-  if (Reachable.size() == F.size())
-    return Changed;
-
-  assert(Reachable.size() < F.size());
-  NumSimpl += F.size()-Reachable.size();
-
-  // Loop over all of the basic blocks that are not reachable, dropping all of
-  // their internal references...
-  for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) {
-    if (Reachable.count(BB))
-      continue;
-
-    for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
-      if (Reachable.count(*SI))
-        (*SI)->removePredecessor(BB);
-    BB->dropAllReferences();
-  }
-
-  for (Function::iterator I = ++F.begin(); I != F.end();)
-    if (!Reachable.count(I))
-      I = F.getBasicBlockList().erase(I);
-    else
-      ++I;
-
-  return true;
-}
-
/// mergeEmptyReturnBlocks - If we have more than one empty (other than phi
/// node) return block, merge them together to promote recursive block merging.
static bool mergeEmptyReturnBlocks(Function &F) {
@@ -326,7 +170,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
bool CFGSimplifyPass::runOnFunction(Function &F) {
  const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
  const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
-  bool EverChanged = removeUnreachableBlocksFromFn(F);
+  bool EverChanged = removeUnreachableBlocks(F);
  EverChanged |= mergeEmptyReturnBlocks(F);
  EverChanged |= iterativelySimplifyCFG(F, TTI, TD);

@@ -334,16 +178,16 @@ bool CFGSimplifyPass::runOnFunction(Function &F) {
  if (!EverChanged) return false;

  // iterativelySimplifyCFG can (rarely) make some loops dead. If this happens,
-  // removeUnreachableBlocksFromFn is needed to nuke them, which means we should
+  // removeUnreachableBlocks is needed to nuke them, which means we should
  // iterate between the two optimizations. We structure the code like this to
  // avoid rerunning iterativelySimplifyCFG if the second pass of
-  // removeUnreachableBlocksFromFn doesn't do anything.
-  if (!removeUnreachableBlocksFromFn(F))
+  // removeUnreachableBlocks doesn't do anything.
+  if (!removeUnreachableBlocks(F))
    return true;

  do {
    EverChanged = iterativelySimplifyCFG(F, TTI, TD);
-    EverChanged |= removeUnreachableBlocksFromFn(F);
+    EverChanged |= removeUnreachableBlocks(F);
  } while (EverChanged);

  return true;
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp
deleted file mode 100644
index 3514e6c..0000000
--- a/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp
+++ /dev/null
@@ -1,247 +0,0 @@
-//===- SimplifyLibCalls.cpp - Optimize specific well-known library calls --===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-// -//===----------------------------------------------------------------------===// -// -// This file implements a simple pass that applies a variety of small -// optimizations for calls to specific well-known function calls (e.g. runtime -// library functions). Any optimization that takes the very simple form -// "replace call to library function with simpler code that provides the same -// result" belongs in this file. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "simplify-libcalls" -#include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/StringMap.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/Config/config.h" // FIXME: Shouldn't depend on host! -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/BuildLibCalls.h" -using namespace llvm; - - -//===----------------------------------------------------------------------===// -// Optimizer Base Class -//===----------------------------------------------------------------------===// - -/// This class is the abstract base class for the set of optimizations that -/// corresponds to one library call. -namespace { -class LibCallOptimization { -protected: - Function *Caller; - const DataLayout *TD; - const TargetLibraryInfo *TLI; - LLVMContext* Context; -public: - LibCallOptimization() { } - virtual ~LibCallOptimization() {} - - /// CallOptimizer - This pure virtual method is implemented by base classes to - /// do various optimizations. If this returns null then no transformation was - /// performed. If it returns CI, then it transformed the call and CI is to be - /// deleted. If it returns something else, replace CI with the new value and - /// delete CI. - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) - =0; - - Value *OptimizeCall(CallInst *CI, const DataLayout *TD, - const TargetLibraryInfo *TLI, IRBuilder<> &B) { - Caller = CI->getParent()->getParent(); - this->TD = TD; - this->TLI = TLI; - if (CI->getCalledFunction()) - Context = &CI->getCalledFunction()->getContext(); - - // We never change the calling convention. - if (CI->getCallingConv() != llvm::CallingConv::C) - return NULL; - - return CallOptimizer(CI->getCalledFunction(), CI, B); - } -}; -} // End anonymous namespace. - - -//===----------------------------------------------------------------------===// -// SimplifyLibCalls Pass Implementation -//===----------------------------------------------------------------------===// - -namespace { - /// This pass optimizes well known library functions from libc and libm. 
- /// - class SimplifyLibCalls : public FunctionPass { - TargetLibraryInfo *TLI; - - StringMap<LibCallOptimization*> Optimizations; - public: - static char ID; // Pass identification - SimplifyLibCalls() : FunctionPass(ID) { - initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry()); - } - void AddOpt(LibFunc::Func F, LibCallOptimization* Opt); - void AddOpt(LibFunc::Func F1, LibFunc::Func F2, LibCallOptimization* Opt); - - void InitOptimizations(); - bool runOnFunction(Function &F); - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<TargetLibraryInfo>(); - } - }; -} // end anonymous namespace. - -char SimplifyLibCalls::ID = 0; - -INITIALIZE_PASS_BEGIN(SimplifyLibCalls, "simplify-libcalls", - "Simplify well-known library calls", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) -INITIALIZE_PASS_END(SimplifyLibCalls, "simplify-libcalls", - "Simplify well-known library calls", false, false) - -// Public interface to the Simplify LibCalls pass. -FunctionPass *llvm::createSimplifyLibCallsPass() { - return new SimplifyLibCalls(); -} - -void SimplifyLibCalls::AddOpt(LibFunc::Func F, LibCallOptimization* Opt) { - if (TLI->has(F)) - Optimizations[TLI->getName(F)] = Opt; -} - -void SimplifyLibCalls::AddOpt(LibFunc::Func F1, LibFunc::Func F2, - LibCallOptimization* Opt) { - if (TLI->has(F1) && TLI->has(F2)) - Optimizations[TLI->getName(F1)] = Opt; -} - -/// Optimizations - Populate the Optimizations map with all the optimizations -/// we know. -void SimplifyLibCalls::InitOptimizations() { -} - - -/// runOnFunction - Top level algorithm. -/// -bool SimplifyLibCalls::runOnFunction(Function &F) { - TLI = &getAnalysis<TargetLibraryInfo>(); - - if (Optimizations.empty()) - InitOptimizations(); - - const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); - - IRBuilder<> Builder(F.getContext()); - - bool Changed = false; - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { - // Ignore non-calls. - CallInst *CI = dyn_cast<CallInst>(I++); - if (!CI || CI->hasFnAttr(Attribute::NoBuiltin)) continue; - - // Ignore indirect calls and calls to non-external functions. - Function *Callee = CI->getCalledFunction(); - if (Callee == 0 || !Callee->isDeclaration() || - !(Callee->hasExternalLinkage() || Callee->hasDLLImportLinkage())) - continue; - - // Ignore unknown calls. - LibCallOptimization *LCO = Optimizations.lookup(Callee->getName()); - if (!LCO) continue; - - // Set the builder to the instruction after the call. - Builder.SetInsertPoint(BB, I); - - // Use debug location of CI for all new instructions. - Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - - // Try to optimize this call. - Value *Result = LCO->OptimizeCall(CI, TD, TLI, Builder); - if (Result == 0) continue; - - DEBUG(dbgs() << "SimplifyLibCalls simplified: " << *CI; - dbgs() << " into: " << *Result << "\n"); - - // Something changed! - Changed = true; - - // Inspect the instruction after the call (which was potentially just - // added) next. 
-      I = CI; ++I;
-
-      if (CI != Result && !CI->use_empty()) {
-        CI->replaceAllUsesWith(Result);
-        if (!Result->hasName())
-          Result->takeName(CI);
-      }
-      CI->eraseFromParent();
-    }
-  }
-  return Changed;
-}
-
-// TODO:
-//   Additional cases that we need to add to this file:
-//
-// cbrt:
-//   * cbrt(expN(X))  -> expN(x/3)
-//   * cbrt(sqrt(x))  -> pow(x,1/6)
-//   * cbrt(cbrt(x))  -> pow(x,1/9)
-//
-// exp, expf, expl:
-//   * exp(log(x))  -> x
-//
-// log, logf, logl:
-//   * log(exp(x))   -> x
-//   * log(x**y)     -> y*log(x)
-//   * log(exp(y))   -> y*log(e)
-//   * log(exp2(y))  -> y*log(2)
-//   * log(exp10(y)) -> y*log(10)
-//   * log(sqrt(x))  -> 0.5*log(x)
-//   * log(pow(x,y)) -> y*log(x)
-//
-// lround, lroundf, lroundl:
-//   * lround(cnst) -> cnst'
-//
-// pow, powf, powl:
-//   * pow(exp(x),y)  -> exp(x*y)
-//   * pow(sqrt(x),y) -> pow(x,y*0.5)
-//   * pow(pow(x,y),z)-> pow(x,y*z)
-//
-// round, roundf, roundl:
-//   * round(cnst) -> cnst'
-//
-// signbit:
-//   * signbit(cnst) -> cnst'
-//   * signbit(nncst) -> 0 (if nncst is a non-negative constant)
-//
-// sqrt, sqrtf, sqrtl:
-//   * sqrt(expN(x))  -> expN(x*0.5)
-//   * sqrt(Nroot(x)) -> pow(x,1/(2*N))
-//   * sqrt(pow(x,y)) -> pow(|x|,y*0.5)
-//
-// strchr:
-//   * strchr(p, 0) -> strlen(p)
-//
-// tan, tanf, tanl:
-//   * tan(atan(x)) -> x
-//
-// trunc, truncf, truncl:
-//   * trunc(cnst) -> cnst'
-//
-//
diff --git a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
new file mode 100644
index 0000000..5045ff8f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -0,0 +1,906 @@
+//===-- StructurizeCFG.cpp ------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "structurizecfg"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/Analysis/RegionPass.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/PatternMatch.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+namespace {
+
+// Definition of the complex types used in this pass.
+
+typedef std::pair<BasicBlock *, Value *> BBValuePair;
+
+typedef SmallVector<RegionNode*, 8> RNVector;
+typedef SmallVector<BasicBlock*, 8> BBVector;
+typedef SmallVector<BranchInst*, 8> BranchVector;
+typedef SmallVector<BBValuePair, 2> BBValueVector;
+
+typedef SmallPtrSet<BasicBlock *, 8> BBSet;
+
+typedef MapVector<PHINode *, BBValueVector> PhiMap;
+typedef MapVector<BasicBlock *, BBVector> BB2BBVecMap;
+
+typedef DenseMap<DomTreeNode *, unsigned> DTN2UnsignedMap;
+typedef DenseMap<BasicBlock *, PhiMap> BBPhiMap;
+typedef DenseMap<BasicBlock *, Value *> BBPredicates;
+typedef DenseMap<BasicBlock *, BBPredicates> PredMap;
+typedef DenseMap<BasicBlock *, BasicBlock*> BB2BBMap;
+
+// The name for newly created blocks.
+ +static const char *const FlowBlockName = "Flow"; + +/// @brief Find the nearest common dominator for multiple BasicBlocks +/// +/// Helper class for StructurizeCFG +/// TODO: Maybe move into common code +class NearestCommonDominator { + DominatorTree *DT; + + DTN2UnsignedMap IndexMap; + + BasicBlock *Result; + unsigned ResultIndex; + bool ExplicitMentioned; + +public: + /// \brief Start a new query + NearestCommonDominator(DominatorTree *DomTree) { + DT = DomTree; + Result = 0; + } + + /// \brief Add BB to the resulting dominator + void addBlock(BasicBlock *BB, bool Remember = true) { + DomTreeNode *Node = DT->getNode(BB); + + if (Result == 0) { + unsigned Numbering = 0; + for (;Node;Node = Node->getIDom()) + IndexMap[Node] = ++Numbering; + Result = BB; + ResultIndex = 1; + ExplicitMentioned = Remember; + return; + } + + for (;Node;Node = Node->getIDom()) + if (IndexMap.count(Node)) + break; + else + IndexMap[Node] = 0; + + assert(Node && "Dominator tree invalid!"); + + unsigned Numbering = IndexMap[Node]; + if (Numbering > ResultIndex) { + Result = Node->getBlock(); + ResultIndex = Numbering; + ExplicitMentioned = Remember && (Result == BB); + } else if (Numbering == ResultIndex) { + ExplicitMentioned |= Remember; + } + } + + /// \brief Is "Result" one of the BBs added with "Remember" = True? + bool wasResultExplicitMentioned() { + return ExplicitMentioned; + } + + /// \brief Get the query result + BasicBlock *getResult() { + return Result; + } +}; + +/// @brief Transforms the control flow graph on one single entry/exit region +/// at a time. +/// +/// After the transform all "If"/"Then"/"Else" style control flow looks like +/// this: +/// +/// \verbatim +/// 1 +/// || +/// | | +/// 2 | +/// | / +/// |/ +/// 3 +/// || Where: +/// | | 1 = "If" block, calculates the condition +/// 4 | 2 = "Then" subregion, runs if the condition is true +/// | / 3 = "Flow" blocks, newly inserted flow blocks, rejoin the flow +/// |/ 4 = "Else" optional subregion, runs if the condition is false +/// 5 5 = "End" block, also rejoins the control flow +/// \endverbatim +/// +/// Control flow is expressed as a branch where the true exit goes into the +/// "Then"/"Else" region, while the false exit skips the region. +/// The condition for the optional "Else" region is expressed as a PHI node. +/// The incoming values of the PHI node are true for the "If" edge and false +/// for the "Then" edge. +/// +/// Additionally, even complicated loops look like this: +/// +/// \verbatim +/// 1 +/// || +/// | | +/// 2 ^ Where: +/// | / 1 = "Entry" block +/// |/ 2 = "Loop" optional subregion, with all exits at "Flow" block +/// 3 3 = "Flow" block, with back edge to entry block +/// | +/// \endverbatim +/// +/// The back edge of the "Flow" block is always on the false side of the branch +/// while the true side continues the general flow. So the loop condition +/// consists of a network of PHI nodes where the true incoming values express +/// breaks and the false values express continue states.
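+/// +/// As a purely illustrative sketch (the block and value names here are +/// invented for this comment and are not produced verbatim by the pass), the +/// "If"/"Then"/"Else" shape above corresponds roughly to IR of the form: +/// +/// \verbatim +/// if: ; 1 = "If" block +/// br i1 %cond, label %then, label %flow +/// then: ; 2 = "Then" subregion +/// br label %flow +/// flow: ; 3 = "Flow" block +/// %else.cond = phi i1 [ true, %if ], [ false, %then ] +/// br i1 %else.cond, label %else, label %end +/// else: ; 4 = "Else" subregion +/// br label %end +/// end: ; 5 = "End" block +/// \endverbatim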
+class StructurizeCFG : public RegionPass { + Type *Boolean; + ConstantInt *BoolTrue; + ConstantInt *BoolFalse; + UndefValue *BoolUndef; + + Function *Func; + Region *ParentRegion; + + DominatorTree *DT; + + RNVector Order; + BBSet Visited; + + BBPhiMap DeletedPhis; + BB2BBVecMap AddedPhis; + + PredMap Predicates; + BranchVector Conditions; + + BB2BBMap Loops; + PredMap LoopPreds; + BranchVector LoopConds; + + RegionNode *PrevNode; + + void orderNodes(); + + void analyzeLoops(RegionNode *N); + + Value *invert(Value *Condition); + + Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert); + + void gatherPredicates(RegionNode *N); + + void collectInfos(); + + void insertConditions(bool Loops); + + void delPhiValues(BasicBlock *From, BasicBlock *To); + + void addPhiValues(BasicBlock *From, BasicBlock *To); + + void setPhiValues(); + + void killTerminator(BasicBlock *BB); + + void changeExit(RegionNode *Node, BasicBlock *NewExit, + bool IncludeDominator); + + BasicBlock *getNextFlow(BasicBlock *Dominator); + + BasicBlock *needPrefix(bool NeedEmpty); + + BasicBlock *needPostfix(BasicBlock *Flow, bool ExitUseAllowed); + + void setPrevNode(BasicBlock *BB); + + bool dominatesPredicates(BasicBlock *BB, RegionNode *Node); + + bool isPredictableTrue(RegionNode *Node); + + void wireFlow(bool ExitUseAllowed, BasicBlock *LoopEnd); + + void handleLoops(bool ExitUseAllowed, BasicBlock *LoopEnd); + + void createFlow(); + + void rebuildSSA(); + +public: + static char ID; + + StructurizeCFG() : + RegionPass(ID) { + initializeStructurizeCFGPass(*PassRegistry::getPassRegistry()); + } + + using Pass::doInitialization; + virtual bool doInitialization(Region *R, RGPassManager &RGM); + + virtual bool runOnRegion(Region *R, RGPassManager &RGM); + + virtual const char *getPassName() const { + return "Structurize control flow"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequiredID(LowerSwitchID); + AU.addRequired<DominatorTree>(); + AU.addPreserved<DominatorTree>(); + RegionPass::getAnalysisUsage(AU); + } +}; + +} // end anonymous namespace + +char StructurizeCFG::ID = 0; + +INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG", + false, false) +INITIALIZE_PASS_DEPENDENCY(LowerSwitch) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(RegionInfo) +INITIALIZE_PASS_END(StructurizeCFG, "structurizecfg", "Structurize the CFG", + false, false) + +/// \brief Initialize the types and constants used in the pass +bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) { + LLVMContext &Context = R->getEntry()->getContext(); + + Boolean = Type::getInt1Ty(Context); + BoolTrue = ConstantInt::getTrue(Context); + BoolFalse = ConstantInt::getFalse(Context); + BoolUndef = UndefValue::get(Boolean); + + return false; +} + +/// \brief Build up the general order of nodes +void StructurizeCFG::orderNodes() { + scc_iterator<Region *> I = scc_begin(ParentRegion), + E = scc_end(ParentRegion); + for (Order.clear(); I != E; ++I) { + std::vector<RegionNode *> &Nodes = *I; + Order.append(Nodes.begin(), Nodes.end()); + } +} + +/// \brief Determine the end of the loops +void StructurizeCFG::analyzeLoops(RegionNode *N) { + if (N->isSubRegion()) { + // Test for exit as back edge + BasicBlock *Exit = N->getNodeAs<Region>()->getExit(); + if (Visited.count(Exit)) + Loops[Exit] = N->getEntry(); + + } else { + // Test for successors as back edge + BasicBlock *BB = N->getNodeAs<BasicBlock>(); + BranchInst *Term = cast<BranchInst>(BB->getTerminator()); + + for
(unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { + BasicBlock *Succ = Term->getSuccessor(i); + + if (Visited.count(Succ)) + Loops[Succ] = BB; + } + } +} + +/// \brief Invert the given condition +Value *StructurizeCFG::invert(Value *Condition) { + // First: Check if it's a constant + if (Condition == BoolTrue) + return BoolFalse; + + if (Condition == BoolFalse) + return BoolTrue; + + if (Condition == BoolUndef) + return BoolUndef; + + // Second: If the condition is already inverted, return the original value + if (match(Condition, m_Not(m_Value(Condition)))) + return Condition; + + if (Instruction *Inst = dyn_cast<Instruction>(Condition)) { + // Third: Check all the users for an invert + BasicBlock *Parent = Inst->getParent(); + for (Value::use_iterator I = Condition->use_begin(), + E = Condition->use_end(); I != E; ++I) { + + Instruction *User = dyn_cast<Instruction>(*I); + if (!User || User->getParent() != Parent) + continue; + + if (match(*I, m_Not(m_Specific(Condition)))) + return *I; + } + + // Last option: Create a new instruction + return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator()); + } + + if (Argument *Arg = dyn_cast<Argument>(Condition)) { + BasicBlock &EntryBlock = Arg->getParent()->getEntryBlock(); + return BinaryOperator::CreateNot(Condition, + Arg->getName() + ".inv", + EntryBlock.getTerminator()); + } + + llvm_unreachable("Unhandled condition to invert"); +} + +/// \brief Build the condition for one edge +Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx, + bool Invert) { + Value *Cond = Invert ? BoolFalse : BoolTrue; + if (Term->isConditional()) { + Cond = Term->getCondition(); + + if (Idx != (unsigned)Invert) + Cond = invert(Cond); + } + return Cond; +} + +/// \brief Analyze the predecessors of each block and build up predicates +void StructurizeCFG::gatherPredicates(RegionNode *N) { + RegionInfo *RI = ParentRegion->getRegionInfo(); + BasicBlock *BB = N->getEntry(); + BBPredicates &Pred = Predicates[BB]; + BBPredicates &LPred = LoopPreds[BB]; + + for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); + PI != PE; ++PI) { + + // Ignore it if it's a branch from outside into our region entry + if (!ParentRegion->contains(*PI)) + continue; + + Region *R = RI->getRegionFor(*PI); + if (R == ParentRegion) { + + // It's a top level block in our region + BranchInst *Term = cast<BranchInst>((*PI)->getTerminator()); + for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { + BasicBlock *Succ = Term->getSuccessor(i); + if (Succ != BB) + continue; + + if (Visited.count(*PI)) { + // Normal forward edge + if (Term->isConditional()) { + // Try to treat it like an ELSE block + BasicBlock *Other = Term->getSuccessor(!i); + if (Visited.count(Other) && !Loops.count(Other) && + !Pred.count(Other) && !Pred.count(*PI)) { + + Pred[Other] = BoolFalse; + Pred[*PI] = BoolTrue; + continue; + } + } + Pred[*PI] = buildCondition(Term, i, false); + + } else { + // Back edge + LPred[*PI] = buildCondition(Term, i, true); + } + } + + } else { + + // It's an exit from a sub region + while(R->getParent() != ParentRegion) + R = R->getParent(); + + // Edge from inside a subregion to its entry, ignore it + if (R == N) + continue; + + BasicBlock *Entry = R->getEntry(); + if (Visited.count(Entry)) + Pred[Entry] = BoolTrue; + else + LPred[Entry] = BoolFalse; + } + } +} + +/// \brief Collect various loop and predicate infos +void StructurizeCFG::collectInfos() { + // Reset predicate + Predicates.clear(); + + // and loop infos + Loops.clear(); + 
LoopPreds.clear(); + + // Reset the visited nodes + Visited.clear(); + + for (RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend(); + OI != OE; ++OI) { + + // Analyze all the conditions leading to a node + gatherPredicates(*OI); + + // Remember that we've seen this node + Visited.insert((*OI)->getEntry()); + + // Find the last back edges + analyzeLoops(*OI); + } +} + +/// \brief Insert the missing branch conditions +void StructurizeCFG::insertConditions(bool Loops) { + BranchVector &Conds = Loops ? LoopConds : Conditions; + Value *Default = Loops ? BoolTrue : BoolFalse; + SSAUpdater PhiInserter; + + for (BranchVector::iterator I = Conds.begin(), + E = Conds.end(); I != E; ++I) { + + BranchInst *Term = *I; + assert(Term->isConditional()); + + BasicBlock *Parent = Term->getParent(); + BasicBlock *SuccTrue = Term->getSuccessor(0); + BasicBlock *SuccFalse = Term->getSuccessor(1); + + PhiInserter.Initialize(Boolean, ""); + PhiInserter.AddAvailableValue(&Func->getEntryBlock(), Default); + PhiInserter.AddAvailableValue(Loops ? SuccFalse : Parent, Default); + + BBPredicates &Preds = Loops ? LoopPreds[SuccFalse] : Predicates[SuccTrue]; + + NearestCommonDominator Dominator(DT); + Dominator.addBlock(Parent, false); + + Value *ParentValue = 0; + for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); + PI != PE; ++PI) { + + if (PI->first == Parent) { + ParentValue = PI->second; + break; + } + PhiInserter.AddAvailableValue(PI->first, PI->second); + Dominator.addBlock(PI->first); + } + + if (ParentValue) { + Term->setCondition(ParentValue); + } else { + if (!Dominator.wasResultExplicitMentioned()) + PhiInserter.AddAvailableValue(Dominator.getResult(), Default); + + Term->setCondition(PhiInserter.GetValueInMiddleOfBlock(Parent)); + } + } +} + +/// \brief Remove all PHI values coming from "From" into "To" and remember +/// them in DeletedPhis +void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) { + PhiMap &Map = DeletedPhis[To]; + for (BasicBlock::iterator I = To->begin(), E = To->end(); + I != E && isa<PHINode>(*I);) { + + PHINode &Phi = cast<PHINode>(*I++); + while (Phi.getBasicBlockIndex(From) != -1) { + Value *Deleted = Phi.removeIncomingValue(From, false); + Map[&Phi].push_back(std::make_pair(From, Deleted)); + } + } +} + +/// \brief Add a dummy PHI value as soon as we know the new predecessor +void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) { + for (BasicBlock::iterator I = To->begin(), E = To->end(); + I != E && isa<PHINode>(*I);) { + + PHINode &Phi = cast<PHINode>(*I++); + Value *Undef = UndefValue::get(Phi.getType()); + Phi.addIncoming(Undef, From); + } + AddedPhis[To].push_back(From); +} + +/// \brief Add the real PHI value as soon as everything is set up +void StructurizeCFG::setPhiValues() { + SSAUpdater Updater; + for (BB2BBVecMap::iterator AI = AddedPhis.begin(), AE = AddedPhis.end(); + AI != AE; ++AI) { + + BasicBlock *To = AI->first; + BBVector &From = AI->second; + + if (!DeletedPhis.count(To)) + continue; + + PhiMap &Map = DeletedPhis[To]; + for (PhiMap::iterator PI = Map.begin(), PE = Map.end(); + PI != PE; ++PI) { + + PHINode *Phi = PI->first; + Value *Undef = UndefValue::get(Phi->getType()); + Updater.Initialize(Phi->getType(), ""); + Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); + Updater.AddAvailableValue(To, Undef); + + NearestCommonDominator Dominator(DT); + Dominator.addBlock(To, false); + for (BBValueVector::iterator VI = PI->second.begin(), + VE = PI->second.end(); VI != VE; ++VI) { +
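+        // Re-register each value removed by delPhiValues with the updater, +        // keyed by the block it originally flowed in from; the updater then +        // computes the proper value for every new predecessor below.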
Updater.AddAvailableValue(VI->first, VI->second); + Dominator.addBlock(VI->first); + } + + if (!Dominator.wasResultExplicitMentioned()) + Updater.AddAvailableValue(Dominator.getResult(), Undef); + + for (BBVector::iterator FI = From.begin(), FE = From.end(); + FI != FE; ++FI) { + + int Idx = Phi->getBasicBlockIndex(*FI); + assert(Idx != -1); + Phi->setIncomingValue(Idx, Updater.GetValueAtEndOfBlock(*FI)); + } + } + + DeletedPhis.erase(To); + } + assert(DeletedPhis.empty()); +} + +/// \brief Remove phi values from all successors and then remove the terminator. +void StructurizeCFG::killTerminator(BasicBlock *BB) { + TerminatorInst *Term = BB->getTerminator(); + if (!Term) + return; + + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); + SI != SE; ++SI) { + + delPhiValues(BB, *SI); + } + + Term->eraseFromParent(); +} + +/// \brief Let node exit(s) point to NewExit +void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit, + bool IncludeDominator) { + if (Node->isSubRegion()) { + Region *SubRegion = Node->getNodeAs<Region>(); + BasicBlock *OldExit = SubRegion->getExit(); + BasicBlock *Dominator = 0; + + // Find all the edges from the sub region to the exit + for (pred_iterator I = pred_begin(OldExit), E = pred_end(OldExit); + I != E;) { + + BasicBlock *BB = *I++; + if (!SubRegion->contains(BB)) + continue; + + // Modify the edges to point to the new exit + delPhiValues(BB, OldExit); + BB->getTerminator()->replaceUsesOfWith(OldExit, NewExit); + addPhiValues(BB, NewExit); + + // Find the new dominator (if requested) + if (IncludeDominator) { + if (!Dominator) + Dominator = BB; + else + Dominator = DT->findNearestCommonDominator(Dominator, BB); + } + } + + // Change the dominator (if requested) + if (Dominator) + DT->changeImmediateDominator(NewExit, Dominator); + + // Update the region info + SubRegion->replaceExit(NewExit); + + } else { + BasicBlock *BB = Node->getNodeAs<BasicBlock>(); + killTerminator(BB); + BranchInst::Create(NewExit, BB); + addPhiValues(BB, NewExit); + if (IncludeDominator) + DT->changeImmediateDominator(NewExit, BB); + } +} + +/// \brief Create a new flow node and update dominator tree and region info +BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) { + LLVMContext &Context = Func->getContext(); + BasicBlock *Insert = Order.empty() ? 
ParentRegion->getExit() : + Order.back()->getEntry(); + BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName, + Func, Insert); + DT->addNewBlock(Flow, Dominator); + ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion); + return Flow; +} + +/// \brief Create a new flow node or reuse the previous one +BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) { + BasicBlock *Entry = PrevNode->getEntry(); + + if (!PrevNode->isSubRegion()) { + killTerminator(Entry); + if (!NeedEmpty || Entry->getFirstInsertionPt() == Entry->end()) + return Entry; + + } + + // create a new flow node + BasicBlock *Flow = getNextFlow(Entry); + + // and wire it up + changeExit(PrevNode, Flow, true); + PrevNode = ParentRegion->getBBNode(Flow); + return Flow; +} + +/// \brief Returns the region exit if possible, otherwise just a new flow node +BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow, + bool ExitUseAllowed) { + if (Order.empty() && ExitUseAllowed) { + BasicBlock *Exit = ParentRegion->getExit(); + DT->changeImmediateDominator(Exit, Flow); + addPhiValues(Flow, Exit); + return Exit; + } + return getNextFlow(Flow); +} + +/// \brief Set the previous node +void StructurizeCFG::setPrevNode(BasicBlock *BB) { + PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB) : 0; +} + +/// \brief Does BB dominate all the predicates of Node? +bool StructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) { + BBPredicates &Preds = Predicates[Node->getEntry()]; + for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); + PI != PE; ++PI) { + + if (!DT->dominates(BB, PI->first)) + return false; + } + return true; +} + +/// \brief Can we predict that this node will always be called? +bool StructurizeCFG::isPredictableTrue(RegionNode *Node) { + BBPredicates &Preds = Predicates[Node->getEntry()]; + bool Dominated = false; + + // The region entry is always true + if (PrevNode == 0) + return true; + + for (BBPredicates::iterator I = Preds.begin(), E = Preds.end(); + I != E; ++I) { + + if (I->second != BoolTrue) + return false; + + if (!Dominated && DT->dominates(I->first, PrevNode->getEntry())) + Dominated = true; + } + + // TODO: The dominator check is too strict + return Dominated; +} + +/// Take one node from the order vector and wire it up +void StructurizeCFG::wireFlow(bool ExitUseAllowed, + BasicBlock *LoopEnd) { + RegionNode *Node = Order.pop_back_val(); + Visited.insert(Node->getEntry()); + + if (isPredictableTrue(Node)) { + // Just a linear flow + if (PrevNode) { + changeExit(PrevNode, Node->getEntry(), true); + } + PrevNode = Node; + + } else { + // Insert extra prefix node (or reuse last one) + BasicBlock *Flow = needPrefix(false); + + // Insert extra postfix node (or use exit instead) + BasicBlock *Entry = Node->getEntry(); + BasicBlock *Next = needPostfix(Flow, ExitUseAllowed); + + // let it point to entry and next block + Conditions.push_back(BranchInst::Create(Entry, Next, BoolUndef, Flow)); + addPhiValues(Flow, Entry); + DT->changeImmediateDominator(Entry, Flow); + + PrevNode = Node; + while (!Order.empty() && !Visited.count(LoopEnd) && + dominatesPredicates(Entry, Order.back())) { + handleLoops(false, LoopEnd); + } + + changeExit(PrevNode, Next, false); + setPrevNode(Next); + } +} + +void StructurizeCFG::handleLoops(bool ExitUseAllowed, + BasicBlock *LoopEnd) { + RegionNode *Node = Order.back(); + BasicBlock *LoopStart = Node->getEntry(); + + if (!Loops.count(LoopStart)) { + wireFlow(ExitUseAllowed, LoopEnd); + return; + } + + if (!isPredictableTrue(Node)) +
LoopStart = needPrefix(true); + + LoopEnd = Loops[Node->getEntry()]; + wireFlow(false, LoopEnd); + while (!Visited.count(LoopEnd)) { + handleLoops(false, LoopEnd); + } + + // If the start of the loop is the entry block, we can't branch to it so + // insert a new dummy entry block. + Function *LoopFunc = LoopStart->getParent(); + if (LoopStart == &LoopFunc->getEntryBlock()) { + LoopStart->setName("entry.orig"); + + BasicBlock *NewEntry = + BasicBlock::Create(LoopStart->getContext(), + "entry", + LoopFunc, + LoopStart); + BranchInst::Create(LoopStart, NewEntry); + } + + // Create an extra loop end node + LoopEnd = needPrefix(false); + BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed); + LoopConds.push_back(BranchInst::Create(Next, LoopStart, + BoolUndef, LoopEnd)); + addPhiValues(LoopEnd, LoopStart); + setPrevNode(Next); +} + +/// After this function, control flow looks like it should, but +/// branches and PHI nodes only have undefined conditions. +void StructurizeCFG::createFlow() { + BasicBlock *Exit = ParentRegion->getExit(); + bool EntryDominatesExit = DT->dominates(ParentRegion->getEntry(), Exit); + + DeletedPhis.clear(); + AddedPhis.clear(); + Conditions.clear(); + LoopConds.clear(); + + PrevNode = 0; + Visited.clear(); + + while (!Order.empty()) { + handleLoops(EntryDominatesExit, 0); + } + + if (PrevNode) + changeExit(PrevNode, Exit, EntryDominatesExit); + else + assert(EntryDominatesExit); +} + +/// Handle a rare case where the disintegrated nodes' instructions +/// no longer dominate all their uses. Not sure if this is really necessary +void StructurizeCFG::rebuildSSA() { + SSAUpdater Updater; + for (Region::block_iterator I = ParentRegion->block_begin(), + E = ParentRegion->block_end(); + I != E; ++I) { + + BasicBlock *BB = *I; + for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); + II != IE; ++II) { + + bool Initialized = false; + for (Use *I = &II->use_begin().getUse(), *Next; I; I = Next) { + + Next = I->getNext(); + + Instruction *User = cast<Instruction>(I->getUser()); + if (User->getParent() == BB) { + continue; + + } else if (PHINode *UserPN = dyn_cast<PHINode>(User)) { + if (UserPN->getIncomingBlock(*I) == BB) + continue; + } + + if (DT->dominates(II, User)) + continue; + + if (!Initialized) { + Value *Undef = UndefValue::get(II->getType()); + Updater.Initialize(II->getType(), ""); + Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); + Updater.AddAvailableValue(BB, II); + Initialized = true; + } + Updater.RewriteUseAfterInsertions(*I); + } + } + } +} + +/// \brief Run the transformation for each region found +bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) { + if (R->isTopLevelRegion()) + return false; + + Func = R->getEntry()->getParent(); + ParentRegion = R; + + DT = &getAnalysis<DominatorTree>(); + + orderNodes(); + collectInfos(); + createFlow(); + insertConditions(false); + insertConditions(true); + setPhiValues(); + rebuildSSA(); + + // Cleanup + Order.clear(); + Visited.clear(); + DeletedPhis.clear(); + AddedPhis.clear(); + Predicates.clear(); + Conditions.clear(); + Loops.clear(); + LoopPreds.clear(); + LoopConds.clear(); + + return true; +} + +/// \brief Create the pass +Pass *llvm::createStructurizeCFGPass() { + return new StructurizeCFG(); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index 2002e68..9fb8ddc 100644 --- a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++
b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -53,6 +53,7 @@ #define DEBUG_TYPE "tailcallelim" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/InlineCost.h" @@ -69,6 +70,7 @@ #include "llvm/Support/CFG.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" @@ -97,16 +99,16 @@ namespace { bool EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, - SmallVector<PHINode*, 8> &ArgumentPHIs, + SmallVectorImpl<PHINode *> &ArgumentPHIs, bool CannotTailCallElimCallsMarkedTail); bool FoldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, - SmallVector<PHINode*, 8> &ArgumentPHIs, + SmallVectorImpl<PHINode *> &ArgumentPHIs, bool CannotTailCallElimCallsMarkedTail); bool ProcessReturningBlock(ReturnInst *RI, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, - SmallVector<PHINode*, 8> &ArgumentPHIs, + SmallVectorImpl<PHINode *> &ArgumentPHIs, bool CannotTailCallElimCallsMarkedTail); bool CanMoveAboveCall(Instruction *I, CallInst *CI); Value *CanTransformAccumulatorRecursion(Instruction *I, CallInst *CI); @@ -129,34 +131,44 @@ void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<TargetTransformInfo>(); } -/// AllocaMightEscapeToCalls - Return true if this alloca may be accessed by -/// callees of this function. We only do very simple analysis right now, this -/// could be expanded in the future to use mod/ref information for particular -/// call sites if desired. -static bool AllocaMightEscapeToCalls(AllocaInst *AI) { - // FIXME: do simple 'address taken' analysis. - return true; +/// CanTRE - Return false if the specified alloca would block tail recursion +/// elimination: that is, if it is variable-sized or not in the entry block. +static bool CanTRE(AllocaInst *AI) { + // Because of PR962, we don't TRE allocas outside the entry block. + + // If this alloca is in the body of the function, or if it is a variable + // sized allocation, we cannot tail call eliminate calls marked 'tail' + // with this mechanism. + BasicBlock *BB = AI->getParent(); + return BB == &BB->getParent()->getEntryBlock() && + isa<ConstantInt>(AI->getArraySize()); } -/// CheckForEscapingAllocas - Scan the specified basic block for alloca -/// instructions. If it contains any that might be accessed by calls, return -/// true. -static bool CheckForEscapingAllocas(BasicBlock *BB, - bool &CannotTCETailMarkedCall) { - bool RetVal = false; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) - if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { - RetVal |= AllocaMightEscapeToCalls(AI); - - // If this alloca is in the body of the function, or if it is a variable - // sized allocation, we cannot tail call eliminate calls marked 'tail' - // with this mechanism.
- if (BB != &BB->getParent()->getEntryBlock() || - !isa<ConstantInt>(AI->getArraySize())) - CannotTCETailMarkedCall = true; - } - return RetVal; -} +namespace { +struct AllocaCaptureTracker : public CaptureTracker { + AllocaCaptureTracker() : Captured(false) {} + + void tooManyUses() LLVM_OVERRIDE { Captured = true; } + + bool shouldExplore(Use *U) LLVM_OVERRIDE { + Value *V = U->getUser(); + if (isa<CallInst>(V) || isa<InvokeInst>(V)) + UsesAlloca.insert(V); + return true; + } + + bool captured(Use *U) LLVM_OVERRIDE { + if (isa<ReturnInst>(U->getUser())) + return false; + Captured = true; + return true; + } + + bool Captured; + SmallPtrSet<const Value *, 16> UsesAlloca; +}; +} // end anonymous namespace bool TailCallElim::runOnFunction(Function &F) { // If this function is a varargs function, we won't be able to PHI the args @@ -168,41 +180,44 @@ bool TailCallElim::runOnFunction(Function &F) { bool TailCallsAreMarkedTail = false; SmallVector<PHINode*, 8> ArgumentPHIs; bool MadeChange = false; - bool FunctionContainsEscapingAllocas = false; - // CannotTCETailMarkedCall - If true, we cannot perform TCE on tail calls + // CanTRETailMarkedCall - If false, we cannot perform TRE on tail calls // marked with the 'tail' attribute, because doing so would cause the stack - // size to increase (real TCE would deallocate variable sized allocas, TCE + // size to increase (real TRE would deallocate variable sized allocas, TRE // doesn't). - bool CannotTCETailMarkedCall = false; - - // Loop over the function, looking for any returning blocks, and keeping track - // of whether this function has any non-trivially used allocas. - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (FunctionContainsEscapingAllocas && CannotTCETailMarkedCall) - break; - - FunctionContainsEscapingAllocas |= - CheckForEscapingAllocas(BB, CannotTCETailMarkedCall); + bool CanTRETailMarkedCall = true; + + // Find calls that can be marked tail. + AllocaCaptureTracker ACT; + for (Function::iterator BB = F.begin(), EE = F.end(); BB != EE; ++BB) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { + CanTRETailMarkedCall &= CanTRE(AI); + PointerMayBeCaptured(AI, &ACT); + // If any allocas are captured, exit. + if (ACT.Captured) + return false; + } + } } - /// FIXME: The code generator produces really bad code when an 'escaping - /// alloca' is changed from being a static alloca to being a dynamic alloca. - /// Until this is resolved, disable this transformation if that would ever - /// happen. This bug is PR962. - if (FunctionContainsEscapingAllocas) - return false; - - // Second pass, change any tail calls to loops. - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { - bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, - ArgumentPHIs,CannotTCETailMarkedCall); - if (!Change && BB->getFirstNonPHIOrDbg() == Ret) - Change = FoldReturnAndProcessPred(BB, Ret, OldEntry, - TailCallsAreMarkedTail, ArgumentPHIs, - CannotTCETailMarkedCall); - MadeChange |= Change; + // Second pass, change any tail recursive calls to loops. + // + // FIXME: The code generator produces really bad code when an 'escaping + // alloca' is changed from being a static alloca to being a dynamic alloca. + // Until this is resolved, disable this transformation if that would ever + // happen. This bug is PR962. 
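+  // Because of this, the return-block rewriting below is only attempted when
+  // no call or invoke in the function is passed one of its allocas, i.e.
+  // when ACT.UsesAlloca is empty.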
+ if (ACT.UsesAlloca.empty()) { + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { + bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, + ArgumentPHIs, !CanTRETailMarkedCall); + if (!Change && BB->getFirstNonPHIOrDbg() == Ret) + Change = FoldReturnAndProcessPred(BB, Ret, OldEntry, + TailCallsAreMarkedTail, ArgumentPHIs, + !CanTRETailMarkedCall); + MadeChange |= Change; + } } } @@ -223,16 +238,24 @@ bool TailCallElim::runOnFunction(Function &F) { } } - // Finally, if this function contains no non-escaping allocas, or calls - // setjmp, mark all calls in the function as eligible for tail calls -//(there is no stack memory for them to access). - if (!FunctionContainsEscapingAllocas && !F.callsFunctionThatReturnsTwice()) - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) + // At this point, we know that the function does not have any captured + // allocas. If additionally the function does not call setjmp, mark all calls + // in the function that do not access stack memory with the tail keyword. This + // requires ensuring that no path leads from a call that takes in an alloca + // but does not capture it to the call which we wish to mark with "tail". + if (!F.callsFunctionThatReturnsTwice()) { + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { if (CallInst *CI = dyn_cast<CallInst>(I)) { - CI->setTailCall(); - MadeChange = true; + if (!ACT.UsesAlloca.count(CI)) { + CI->setTailCall(); + MadeChange = true; + } } + } + } + } return MadeChange; } @@ -424,7 +447,7 @@ TailCallElim::FindTRECandidate(Instruction *TI, bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, - SmallVector<PHINode*, 8> &ArgumentPHIs, + SmallVectorImpl<PHINode *> &ArgumentPHIs, bool CannotTailCallElimCallsMarkedTail) { // If we are introducing accumulator recursion to eliminate operations after // the call instruction that are both associative and commutative, the initial @@ -600,7 +623,7 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, - SmallVector<PHINode*, 8> &ArgumentPHIs, + SmallVectorImpl<PHINode *> &ArgumentPHIs, bool CannotTailCallElimCallsMarkedTail) { bool Change = false; @@ -634,10 +657,11 @@ bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB, return Change; } -bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, - bool &TailCallsAreMarkedTail, - SmallVector<PHINode*, 8> &ArgumentPHIs, - bool CannotTailCallElimCallsMarkedTail) { +bool +TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, + SmallVectorImpl<PHINode *> &ArgumentPHIs, + bool CannotTailCallElimCallsMarkedTail) { CallInst *CI = FindTRECandidate(Ret, CannotTailCallElimCallsMarkedTail); if (!CI) return false; diff --git a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index ba99d2e..12de9ee 100644 --- a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -14,6 +14,7 @@ #include
"llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" @@ -170,7 +171,7 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) { if (DomTreeNode *DTN = DT->getNode(BB)) { DomTreeNode *PredDTN = DT->getNode(PredBB); SmallVector<DomTreeNode*, 8> Children(DTN->begin(), DTN->end()); - for (SmallVector<DomTreeNode*, 8>::iterator DI = Children.begin(), + for (SmallVectorImpl<DomTreeNode *>::iterator DI = Children.begin(), DE = Children.end(); DI != DE; ++DI) DT->changeImmediateDominator(*DI, PredDTN); @@ -235,22 +236,6 @@ void llvm::ReplaceInstWithInst(Instruction *From, Instruction *To) { ReplaceInstWithInst(From->getParent()->getInstList(), BI, To); } -/// GetSuccessorNumber - Search for the specified successor of basic block BB -/// and return its position in the terminator instruction's list of -/// successors. It is an error to call this with a block that is not a -/// successor. -unsigned llvm::GetSuccessorNumber(BasicBlock *BB, BasicBlock *Succ) { - TerminatorInst *Term = BB->getTerminator(); -#ifndef NDEBUG - unsigned e = Term->getNumSuccessors(); -#endif - for (unsigned i = 0; ; ++i) { - assert(i != e && "Didn't find edge?"); - if (Term->getSuccessor(i) == Succ) - return i; - } -} - /// SplitEdge - Split the edge connecting specified block. Pass P must /// not be NULL. BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) { @@ -263,7 +248,6 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) { // If the edge isn't critical, then BB has a single successor or Succ has a // single pred. Split the block. - BasicBlock::iterator SplitPoint; if (BasicBlock *SP = Succ->getSinglePredecessor()) { // If the successor only has a single pred, split the top of the successor // block. @@ -416,8 +400,12 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB, // If all incoming values for the new PHI would be the same, just don't // make a new PHI. Instead, just remove the incoming values from the old // PHI. - for (unsigned i = 0, e = Preds.size(); i != e; ++i) - PN->removeIncomingValue(Preds[i], false); + for (unsigned i = 0, e = Preds.size(); i != e; ++i) { + // Explicitly check the BB index here to handle duplicates in Preds. + int Idx = PN->getBasicBlockIndex(Preds[i]); + if (Idx >= 0) + PN->removeIncomingValue(Idx, false); + } } else { // If the values coming into the block are not the same, we need a PHI. // Create the new PHI node, insert it into NewBB at the end of the block @@ -598,52 +586,6 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB, } } -/// FindFunctionBackedges - Analyze the specified function to find all of the -/// loop backedges in the function and return them. This is a relatively cheap -/// (compared to computing dominators and loop info) analysis. -/// -/// The output is added to Result, as pairs of <from,to> edge info. 
-void llvm::FindFunctionBackedges(const Function &F, - SmallVectorImpl<std::pair<const BasicBlock*,const BasicBlock*> > &Result) { - const BasicBlock *BB = &F.getEntryBlock(); - if (succ_begin(BB) == succ_end(BB)) - return; - - SmallPtrSet<const BasicBlock*, 8> Visited; - SmallVector<std::pair<const BasicBlock*, succ_const_iterator>, 8> VisitStack; - SmallPtrSet<const BasicBlock*, 8> InStack; - - Visited.insert(BB); - VisitStack.push_back(std::make_pair(BB, succ_begin(BB))); - InStack.insert(BB); - do { - std::pair<const BasicBlock*, succ_const_iterator> &Top = VisitStack.back(); - const BasicBlock *ParentBB = Top.first; - succ_const_iterator &I = Top.second; - - bool FoundNew = false; - while (I != succ_end(ParentBB)) { - BB = *I++; - if (Visited.insert(BB)) { - FoundNew = true; - break; - } - // Successor is in VisitStack, it's a back edge. - if (InStack.count(BB)) - Result.push_back(std::make_pair(ParentBB, BB)); - } - - if (FoundNew) { - // Go down one level if there is a unvisited successor. - InStack.insert(BB); - VisitStack.push_back(std::make_pair(BB, succ_begin(BB))); - } else { - // Go up one level. - InStack.erase(VisitStack.pop_back_val().first); - } - } while (!VisitStack.empty()); -} - /// FoldReturnIntoUncondBranch - This method duplicates the specified return /// instruction into a predecessor which ends in an unconditional branch. If /// the return instruction returns a value defined by a PHI, propagate the @@ -726,3 +668,104 @@ TerminatorInst *llvm::SplitBlockAndInsertIfThen(Instruction *Cmp, ReplaceInstWithInst(HeadOldTerm, HeadNewTerm); return CheckTerm; } + +/// GetIfCondition - Given a basic block (BB) with two predecessors, +/// check to see if the merge at this block is due +/// to an "if condition". If so, return the boolean condition that determines +/// which entry into BB will be taken. Also, return by references the block +/// that will be entered from if the condition is true, and the block that will +/// be entered if the condition is false. +/// +/// This does no checking to see if the true/false blocks have large or unsavory +/// instructions in them. +Value *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue, + BasicBlock *&IfFalse) { + PHINode *SomePHI = dyn_cast<PHINode>(BB->begin()); + BasicBlock *Pred1 = NULL; + BasicBlock *Pred2 = NULL; + + if (SomePHI) { + if (SomePHI->getNumIncomingValues() != 2) + return NULL; + Pred1 = SomePHI->getIncomingBlock(0); + Pred2 = SomePHI->getIncomingBlock(1); + } else { + pred_iterator PI = pred_begin(BB), PE = pred_end(BB); + if (PI == PE) // No predecessor + return NULL; + Pred1 = *PI++; + if (PI == PE) // Only one predecessor + return NULL; + Pred2 = *PI++; + if (PI != PE) // More than two predecessors + return NULL; + } + + // We can only handle branches. Other control flow will be lowered to + // branches if possible anyway. + BranchInst *Pred1Br = dyn_cast<BranchInst>(Pred1->getTerminator()); + BranchInst *Pred2Br = dyn_cast<BranchInst>(Pred2->getTerminator()); + if (Pred1Br == 0 || Pred2Br == 0) + return 0; + + // Eliminate code duplication by ensuring that Pred1Br is conditional if + // either are. + if (Pred2Br->isConditional()) { + // If both branches are conditional, we don't have an "if statement". In + // reality, we could transform this case, but since the condition will be + // required anyway, we stand no chance of eliminating it, so the xform is + // probably not profitable. 
+ if (Pred1Br->isConditional()) + return 0; + + std::swap(Pred1, Pred2); + std::swap(Pred1Br, Pred2Br); + } + + if (Pred1Br->isConditional()) { + // The only thing we have to watch out for here is to make sure that Pred2 + // doesn't have incoming edges from other blocks. If it does, the condition + // doesn't dominate BB. + if (Pred2->getSinglePredecessor() == 0) + return 0; + + // If we found a conditional branch predecessor, make sure that it branches + // to BB and Pred2Br. If it doesn't, this isn't an "if statement". + if (Pred1Br->getSuccessor(0) == BB && + Pred1Br->getSuccessor(1) == Pred2) { + IfTrue = Pred1; + IfFalse = Pred2; + } else if (Pred1Br->getSuccessor(0) == Pred2 && + Pred1Br->getSuccessor(1) == BB) { + IfTrue = Pred2; + IfFalse = Pred1; + } else { + // We know that one arm of the conditional goes to BB, so the other must + // go somewhere unrelated, and this must not be an "if statement". + return 0; + } + + return Pred1Br->getCondition(); + } + + // Ok, if we got here, both predecessors end with an unconditional branch to + // BB. Don't panic! If both blocks only have a single (identical) + // predecessor, and THAT is a conditional branch, then we're all ok! + BasicBlock *CommonPred = Pred1->getSinglePredecessor(); + if (CommonPred == 0 || CommonPred != Pred2->getSinglePredecessor()) + return 0; + + // Otherwise, if this is a conditional branch, then we can use it! + BranchInst *BI = dyn_cast<BranchInst>(CommonPred->getTerminator()); + if (BI == 0) return 0; + + assert(BI->isConditional() && "Two successors but not conditional?"); + if (BI->getSuccessor(0) == Pred1) { + IfTrue = Pred1; + IfFalse = Pred2; + } else { + IfTrue = Pred2; + IfFalse = Pred1; + } + return BI->getCondition(); +} diff --git a/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index 8513772..0e7f7f7 100644 --- a/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -19,9 +19,9 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/ProfileInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" @@ -44,7 +44,6 @@ namespace { virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<DominatorTree>(); AU.addPreserved<LoopInfo>(); - AU.addPreserved<ProfileInfo>(); // No loop canonicalization guarantees are broken by this pass. AU.addPreservedID(LoopSimplifyID); @@ -84,39 +83,6 @@ bool BreakCriticalEdges::runOnFunction(Function &F) { // Implementation of the external critical edge manipulation functions //===----------------------------------------------------------------------===// -// isCriticalEdge - Return true if the specified edge is a critical edge. -// Critical edges are edges from a block with multiple successors to a block -// with multiple predecessors. -// -bool llvm::isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum, - bool AllowIdenticalEdges) { - assert(SuccNum < TI->getNumSuccessors() && "Illegal edge specification!"); - if (TI->getNumSuccessors() == 1) return false; - - const BasicBlock *Dest = TI->getSuccessor(SuccNum); - const_pred_iterator I = pred_begin(Dest), E = pred_end(Dest); - - // If there is more than one predecessor, this is a critical edge... 
- assert(I != E && "No preds, but we have an edge to the block?"); - const BasicBlock *FirstPred = *I; - ++I; // Skip one edge due to the incoming arc from TI. - if (!AllowIdenticalEdges) - return I != E; - - // If AllowIdenticalEdges is true, then we allow this edge to be considered - // non-critical iff all preds come from TI's block. - while (I != E) { - const BasicBlock *P = *I; - if (P != FirstPred) - return true; - // Note: leave this as is until no one ever compiles with either gcc 4.0.1 - // or Xcode 2. This seems to work around the pred_iterator assert in PR 2207 - E = pred_end(P); - ++I; - } - return false; -} - /// createPHIsForSplitLoopExit - When a loop exit edge is split, LCSSA form /// may require new PHIs in the new exit block. This function inserts the /// new PHIs, as needed. Preds is a list of preds inside the loop, SplitBB @@ -245,10 +211,9 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>(); LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>(); - ProfileInfo *PI = P->getAnalysisIfAvailable<ProfileInfo>(); // If we have nothing to update, just return. - if (DT == 0 && LI == 0 && PI == 0) + if (DT == 0 && LI == 0) return NewBB; // Now update analysis information. Since the only predecessor of NewBB is @@ -401,9 +366,5 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, } } - // Update ProfileInfo if it is around. - if (PI) - PI->splitEdge(TIBB, DestBB, NewBB, MergeIdenticalEdges); - return NewBB; } diff --git a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp index be8d39e..d105f5e 100644 --- a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -78,7 +78,8 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, bool ModuleLevelChanges, SmallVectorImpl<ReturnInst*> &Returns, const char *NameSuffix, ClonedCodeInfo *CodeInfo, - ValueMapTypeRemapper *TypeMapper) { + ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { assert(NameSuffix && "NameSuffix cannot be null!"); #ifndef NDEBUG @@ -147,7 +148,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, for (BasicBlock::iterator II = BB->begin(); II != BB->end(); ++II) RemapInstruction(II, VMap, ModuleLevelChanges ? 
RF_None : RF_NoModuleLevelChanges, - TypeMapper); + TypeMapper, Materializer); } /// CloneFunction - Return a copy of the specified function, but without diff --git a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp index f7c659f..6f008644 100644 --- a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -277,8 +277,8 @@ void CodeExtractor::splitReturnBlocks() { DomTreeNode *NewNode = DT->addNewBlock(New, *I); - for (SmallVector<DomTreeNode*, 8>::iterator I = Children.begin(), - E = Children.end(); I != E; ++I) + for (SmallVectorImpl<DomTreeNode *>::iterator I = Children.begin(), + E = Children.end(); I != E; ++I) DT->changeImmediateDominator(*I, NewNode); } } @@ -665,8 +665,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, TheSwitch->setCondition(call); TheSwitch->setDefaultDest(TheSwitch->getSuccessor(NumExitBlocks)); // Remove redundant case - SwitchInst::CaseIt ToBeRemoved(TheSwitch, NumExitBlocks-1); - TheSwitch->removeCase(ToBeRemoved); + TheSwitch->removeCase(SwitchInst::CaseIt(TheSwitch, NumExitBlocks-1)); break; } } diff --git a/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp index db525cd..0723b35 100644 --- a/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp +++ b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp @@ -10,6 +10,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/Analysis/CFG.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" diff --git a/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp b/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp new file mode 100644 index 0000000..1da226b --- /dev/null +++ b/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp @@ -0,0 +1,486 @@ +//===- FlattenCFG.cpp - Code to perform CFG flattening ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Reduce conditional branches in CFG. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "flattencfg" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +using namespace llvm; + +namespace { +class FlattenCFGOpt { + AliasAnalysis *AA; + /// \brief Use parallel-and or parallel-or to generate conditions for + /// conditional branches. + bool FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder, Pass *P = 0); + /// \brief If \param BB is the merge block of an if-region, attempt to merge + /// the if-region with an adjacent if-region upstream if two if-regions + /// contain identical instructions. + bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder, Pass *P = 0); + /// \brief Compare a pair of blocks: \p Block1 and \p Block2, which + /// are from two if-regions whose entry blocks are \p Head1 and \p + /// Head2.
\returns true if \p Block1 and \p Block2 contain identical + /// instructions, and have no memory reference alias with \p Head2. + /// This is used as a legality check for merging if-regions. + bool CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2, + BasicBlock *Block1, BasicBlock *Block2); + +public: + FlattenCFGOpt(AliasAnalysis *AA) : AA(AA) {} + bool run(BasicBlock *BB); +}; +} + +/// If \param [in] BB has more than one predecessor that is a conditional +/// branch, attempt to use parallel and/or for the branch condition. \returns +/// true on success. +/// +/// Before: +/// ...... +/// %cmp10 = fcmp une float %tmp1, %tmp2 +/// br i1 %cmp10, label %if.then, label %lor.rhs +/// +/// lor.rhs: +/// ...... +/// %cmp11 = fcmp une float %tmp3, %tmp4 +/// br i1 %cmp11, label %if.then, label %if.end +/// +/// if.end: // the merge block +/// ...... +/// +/// if.then: // has two predecessors, both of which contain a conditional branch. +/// ...... +/// br label %if.end; +/// +/// After: +/// ...... +/// %cmp10 = fcmp une float %tmp1, %tmp2 +/// ...... +/// %cmp11 = fcmp une float %tmp3, %tmp4 +/// %cmp12 = or i1 %cmp10, %cmp11 // parallel-or mode. +/// br i1 %cmp12, label %if.then, label %if.end +/// +/// if.end: +/// ...... +/// +/// if.then: +/// ...... +/// br label %if.end; +/// +/// Current implementation handles two cases. +/// Case 1: \param BB is on the else-path. +/// +/// BB1 +/// / | +/// BB2 | +/// / \ | +/// BB3 \ | where BB1, BB2 contain conditional branches. +/// \ | / BB3 contains unconditional branch. +/// \ | / BB4 corresponds to \param BB which is also the merge. +/// BB => BB4 +/// +/// +/// Corresponding source code: +/// +/// if (a == b && c == d) +/// statement; // BB3 +/// +/// Case 2: \param BB is on the then-path. +/// +/// BB1 +/// / | +/// | BB2 +/// \ / | where BB1, BB2 contain conditional branches. +/// BB => BB3 | BB3 contains an unconditional branch and corresponds +/// \ / to \param BB. BB4 is the merge. +/// BB4 +/// +/// Corresponding source code: +/// +/// if (a == b || c == d) +/// statement; // BB3 +/// +/// In both cases, \param BB is the common successor of conditional branches. +/// In Case 1, \param BB (BB4) has an unconditional branch (BB3) as +/// its predecessor. In Case 2, \param BB (BB3) only has conditional branches +/// as its predecessors. +/// +bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder, + Pass *P) { + PHINode *PHI = dyn_cast<PHINode>(BB->begin()); + if (PHI) + return false; // For simplicity, avoid cases containing PHI nodes. + + BasicBlock *LastCondBlock = NULL; + BasicBlock *FirstCondBlock = NULL; + BasicBlock *UnCondBlock = NULL; + int Idx = -1; + + // Check predecessors of \param BB. + SmallPtrSet<BasicBlock *, 16> Preds(pred_begin(BB), pred_end(BB)); + for (SmallPtrSetIterator<BasicBlock *> PI = Preds.begin(), PE = Preds.end(); + PI != PE; ++PI) { + BasicBlock *Pred = *PI; + BranchInst *PBI = dyn_cast<BranchInst>(Pred->getTerminator()); + + // All predecessors should terminate with a branch. + if (!PBI) + return false; + + BasicBlock *PP = Pred->getSinglePredecessor(); + + if (PBI->isUnconditional()) { + // Case 1: Pred (BB3) is an unconditional block, it should + // have a single predecessor (BB2) that is also a predecessor + // of \param BB (BB4), and its address should not be taken. + // There should exist only one such unconditional + // branch among the predecessors.
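+      // Bail out if we already found an unconditional predecessor, if Pred
+      // has no single predecessor, if that predecessor is not itself a
+      // predecessor of BB, or if Pred's address is taken.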
+ if (UnCondBlock || !PP || (Preds.count(PP) == 0) || + Pred->hasAddressTaken()) + return false; + + UnCondBlock = Pred; + continue; + } + + // Only conditional branches are allowed beyond this point. + assert(PBI->isConditional()); + + // Condition's unique use should be the branch instruction. + Value *PC = PBI->getCondition(); + if (!PC || !PC->hasOneUse()) + return false; + + if (PP && Preds.count(PP)) { + // These are internal condition blocks to be merged from, e.g., + // BB2 in both cases. + // Should not be address-taken. + if (Pred->hasAddressTaken()) + return false; + + // Instructions in the internal condition blocks should be safe + // to hoist up. + for (BasicBlock::iterator BI = Pred->begin(), BE = PBI; BI != BE;) { + Instruction *CI = BI++; + if (isa<PHINode>(CI) || !isSafeToSpeculativelyExecute(CI)) + return false; + } + } else { + // This is the condition block to be merged into, e.g. BB1 in + // both cases. + if (FirstCondBlock) + return false; + FirstCondBlock = Pred; + } + + // Find whether BB is uniformly on the true (or false) path + // for all of its predecessors. + BasicBlock *PS1 = PBI->getSuccessor(0); + BasicBlock *PS2 = PBI->getSuccessor(1); + BasicBlock *PS = (PS1 == BB) ? PS2 : PS1; + int CIdx = (PS1 == BB) ? 0 : 1; + + if (Idx == -1) + Idx = CIdx; + else if (CIdx != Idx) + return false; + + // PS is the successor which is not BB. Check successors to identify + // the last conditional branch. + if (Preds.count(PS) == 0) { + // Case 2. + LastCondBlock = Pred; + } else { + // Case 1 + BranchInst *BPS = dyn_cast<BranchInst>(PS->getTerminator()); + if (BPS && BPS->isUnconditional()) { + // Case 1: PS (BB3) should end with an unconditional branch. + LastCondBlock = Pred; + } + } + } + + if (!FirstCondBlock || !LastCondBlock || (FirstCondBlock == LastCondBlock)) + return false; + + TerminatorInst *TBB = LastCondBlock->getTerminator(); + BasicBlock *PS1 = TBB->getSuccessor(0); + BasicBlock *PS2 = TBB->getSuccessor(1); + BranchInst *PBI1 = dyn_cast<BranchInst>(PS1->getTerminator()); + BranchInst *PBI2 = dyn_cast<BranchInst>(PS2->getTerminator()); + + // If PS1 does not jump into PS2, but PS2 jumps into PS1, + // attempt branch inversion. + if (!PBI1 || !PBI1->isUnconditional() || + (PS1->getTerminator()->getSuccessor(0) != PS2)) { + // Check whether PS2 jumps into PS1. + if (!PBI2 || !PBI2->isUnconditional() || + (PS2->getTerminator()->getSuccessor(0) != PS1)) + return false; + + // Do branch inversion. + BasicBlock *CurrBlock = LastCondBlock; + bool EverChanged = false; + while (1) { + BranchInst *BI = dyn_cast<BranchInst>(CurrBlock->getTerminator()); + CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition()); + CmpInst::Predicate Predicate = CI->getPredicate(); + // Canonicalize icmp_ne -> icmp_eq, fcmp_one -> fcmp_oeq + if ((Predicate == CmpInst::ICMP_NE) || (Predicate == CmpInst::FCMP_ONE)) { + CI->setPredicate(ICmpInst::getInversePredicate(Predicate)); + BI->swapSuccessors(); + EverChanged = true; + } + if (CurrBlock == FirstCondBlock) + break; + CurrBlock = CurrBlock->getSinglePredecessor(); + } + return EverChanged; + } + + // PS1 must end with an unconditional branch. + if (!PBI1 || !PBI1->isUnconditional()) + return false; + + // PS2 should not contain a PHI node. + PHI = dyn_cast<PHINode>(PS2->begin()); + if (PHI) + return false; + + // Do the transformation.
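+  // The do-while below walks from FirstCondBlock towards LastCondBlock,
+  // splicing each next condition block into FirstCondBlock and folding the
+  // two conditions together: or'ed for Case 2 (Idx == 0), and'ed for Case 1.
+  // Each emptied block is terminated with 'unreachable' for later deletion.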
+ BasicBlock *CB; + BranchInst *PBI = dyn_cast<BranchInst>(FirstCondBlock->getTerminator()); + bool Iteration = true; + IRBuilder<>::InsertPointGuard Guard(Builder); + Value *PC = PBI->getCondition(); + + do { + CB = PBI->getSuccessor(1 - Idx); + // Delete the conditional branch. + FirstCondBlock->getInstList().pop_back(); + FirstCondBlock->getInstList() + .splice(FirstCondBlock->end(), CB->getInstList()); + PBI = cast<BranchInst>(FirstCondBlock->getTerminator()); + Value *CC = PBI->getCondition(); + // Merge conditions. + Builder.SetInsertPoint(PBI); + Value *NC; + if (Idx == 0) + // Case 2, use parallel or. + NC = Builder.CreateOr(PC, CC); + else + // Case 1, use parallel and. + NC = Builder.CreateAnd(PC, CC); + + PBI->replaceUsesOfWith(CC, NC); + PC = NC; + if (CB == LastCondBlock) + Iteration = false; + // Remove internal conditional branches. + CB->dropAllReferences(); + // Make CB unreachable and let a downstream pass delete the block. + new UnreachableInst(CB->getContext(), CB); + } while (Iteration); + + DEBUG(dbgs() << "Use parallel and/or in:\n" << *FirstCondBlock); + return true; +} + +/// Compare blocks from two if-regions, where \param Head1 is the entry of the +/// 1st if-region. \param Head2 is the entry of the 2nd if-region. \param +/// Block1 is a block in the 1st if-region to compare. \param Block2 is a block +/// in the 2nd if-region to compare. \returns true if \param Block1 and \param +/// Block2 have identical instructions and do not have memory reference alias +/// with \param Head2. +/// +bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2, + BasicBlock *Block1, + BasicBlock *Block2) { + TerminatorInst *PTI2 = Head2->getTerminator(); + Instruction *PBI2 = Head2->begin(); + + bool eq1 = (Block1 == Head1); + bool eq2 = (Block2 == Head2); + if (eq1 || eq2) { + // An empty then-path or else-path. + return (eq1 == eq2); + } + + // Check whether instructions in Block1 and Block2 are identical + // and do not alias with instructions in Head2. + BasicBlock::iterator iter1 = Block1->begin(); + BasicBlock::iterator end1 = Block1->getTerminator(); + BasicBlock::iterator iter2 = Block2->begin(); + BasicBlock::iterator end2 = Block2->getTerminator(); + + while (1) { + if (iter1 == end1) { + if (iter2 != end2) + return false; + break; + } + + if (!iter1->isIdenticalTo(iter2)) + return false; + + // Illegal to remove instructions with side effects except + // non-volatile stores. + if (iter1->mayHaveSideEffects()) { + Instruction *CurI = &*iter1; + StoreInst *SI = dyn_cast<StoreInst>(CurI); + if (!SI || SI->isVolatile()) + return false; + } + + // For simplicity and speed, the data dependency check can be + // avoided if there are no reads from memory. + if (iter1->mayReadFromMemory()) + return false; + + if (iter1->mayWriteToMemory()) { + for (BasicBlock::iterator BI = PBI2, BE = PTI2; BI != BE; ++BI) { + if (BI->mayReadFromMemory() || BI->mayWriteToMemory()) { + // Check alias with Head2. + if (!AA || AA->alias(iter1, BI)) + return false; + } + } + } + ++iter1; + ++iter2; + } + + return true; +} + +/// Check whether \param BB is the merge block of an if-region. If yes, check +/// whether there exists an adjacent if-region upstream such that the two +/// if-regions contain identical instructions and can be legally merged. +/// \returns true if the two if-regions are merged.
+///
+/// From:
+///   if (a)
+///     statement;
+///   if (b)
+///     statement;
+///
+/// To:
+///   if (a || b)
+///     statement;
+///
+bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder,
+                                  Pass *P) {
+  BasicBlock *IfTrue2, *IfFalse2;
+  Value *IfCond2 = GetIfCondition(BB, IfTrue2, IfFalse2);
+  Instruction *CInst2 = dyn_cast_or_null<Instruction>(IfCond2);
+  if (!CInst2)
+    return false;
+
+  BasicBlock *SecondEntryBlock = CInst2->getParent();
+  if (SecondEntryBlock->hasAddressTaken())
+    return false;
+
+  BasicBlock *IfTrue1, *IfFalse1;
+  Value *IfCond1 = GetIfCondition(SecondEntryBlock, IfTrue1, IfFalse1);
+  Instruction *CInst1 = dyn_cast_or_null<Instruction>(IfCond1);
+  if (!CInst1)
+    return false;
+
+  BasicBlock *FirstEntryBlock = CInst1->getParent();
+
+  // Either then-path or else-path should be empty.
+  if ((IfTrue1 != FirstEntryBlock) && (IfFalse1 != FirstEntryBlock))
+    return false;
+  if ((IfTrue2 != SecondEntryBlock) && (IfFalse2 != SecondEntryBlock))
+    return false;
+
+  TerminatorInst *PTI2 = SecondEntryBlock->getTerminator();
+  Instruction *PBI2 = SecondEntryBlock->begin();
+
+  if (!CompareIfRegionBlock(FirstEntryBlock, SecondEntryBlock, IfTrue1,
+                            IfTrue2))
+    return false;
+
+  if (!CompareIfRegionBlock(FirstEntryBlock, SecondEntryBlock, IfFalse1,
+                            IfFalse2))
+    return false;
+
+  // Check that instructions in \param SecondEntryBlock have no side effects
+  // and are safe to speculatively execute.
+  for (BasicBlock::iterator BI = PBI2, BE = PTI2; BI != BE; ++BI) {
+    Instruction *CI = BI;
+    if (isa<PHINode>(CI) || CI->mayHaveSideEffects() ||
+        !isSafeToSpeculativelyExecute(CI))
+      return false;
+  }
+
+  // Merge \param SecondEntryBlock into \param FirstEntryBlock.
+  FirstEntryBlock->getInstList().pop_back();
+  FirstEntryBlock->getInstList()
+      .splice(FirstEntryBlock->end(), SecondEntryBlock->getInstList());
+  BranchInst *PBI = dyn_cast<BranchInst>(FirstEntryBlock->getTerminator());
+  Value *CC = PBI->getCondition();
+  BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
+  BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+  Builder.SetInsertPoint(PBI);
+  Value *NC = Builder.CreateOr(CInst1, CC);
+  PBI->replaceUsesOfWith(CC, NC);
+  Builder.SetInsertPoint(SaveInsertBB, SaveInsertPt);
+
+  // Remove IfTrue1.
+  if (IfTrue1 != FirstEntryBlock) {
+    IfTrue1->dropAllReferences();
+    IfTrue1->eraseFromParent();
+  }
+
+  // Remove IfFalse1.
+  if (IfFalse1 != FirstEntryBlock) {
+    IfFalse1->dropAllReferences();
+    IfFalse1->eraseFromParent();
+  }
+
+  // Remove \param SecondEntryBlock.
+  SecondEntryBlock->dropAllReferences();
+  SecondEntryBlock->eraseFromParent();
+  DEBUG(dbgs() << "If conditions merged into:\n" << *FirstEntryBlock);
+  return true;
+}
+
+bool FlattenCFGOpt::run(BasicBlock *BB) {
+  bool Changed = false;
+  assert(BB && BB->getParent() && "Block not embedded in function!");
+  assert(BB->getTerminator() && "Degenerate basic block encountered!");
+
+  IRBuilder<> Builder(BB);
+
+  if (FlattenParallelAndOr(BB, Builder))
+    return true;
+
+  if (MergeIfRegion(BB, Builder))
+    return true;
+
+  return Changed;
+}
+
+/// FlattenCFG - This function is used to flatten a CFG. For
+/// example, it uses parallel-and and parallel-or modes to collapse
+/// if-conditions and merge if-regions with identical statements.
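+///
+/// A minimal usage sketch (the surrounding names are hypothetical; a driver
+/// pass would normally re-run this to a fixed point, since one flattening can
+/// enable another):
+///   for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+///     Changed |= FlattenCFG(I, AA);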
+///
+bool llvm::FlattenCFG(BasicBlock *BB, AliasAnalysis *AA) {
+  return FlattenCFGOpt(AA).run(BB);
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp
new file mode 100644
index 0000000..5f0a563
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp
@@ -0,0 +1,183 @@
+//===-- GlobalStatus.cpp - Compute status info for globals ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+
+using namespace llvm;
+
+/// Return the stronger of the two orderings. If the two orderings are acquire
+/// and release, then return AcquireRelease.
+///
+static AtomicOrdering strongerOrdering(AtomicOrdering X, AtomicOrdering Y) {
+  if (X == Acquire && Y == Release)
+    return AcquireRelease;
+  if (Y == Acquire && X == Release)
+    return AcquireRelease;
+  return (AtomicOrdering)std::max(X, Y);
+}
+
+/// It is safe to destroy a constant iff it is itself only used by constants.
+/// Note that constants cannot be cyclic, so this test is pretty easy to
+/// implement recursively.
+///
+bool llvm::isSafeToDestroyConstant(const Constant *C) {
+  if (isa<GlobalValue>(C))
+    return false;
+
+  for (Value::const_use_iterator UI = C->use_begin(), E = C->use_end(); UI != E;
+       ++UI)
+    if (const Constant *CU = dyn_cast<Constant>(*UI)) {
+      if (!isSafeToDestroyConstant(CU))
+        return false;
+    } else
+      return false;
+  return true;
+}
+
+static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
+                             SmallPtrSet<const PHINode *, 16> &PhiUsers) {
+  for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;
+       ++UI) {
+    const User *U = *UI;
+    if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
+      GS.HasNonInstructionUser = true;
+
+      // If the result of the constantexpr isn't pointer type, then we won't
+      // know to expect it in various places. Just reject early.
+      if (!isa<PointerType>(CE->getType()))
+        return true;
+
+      if (analyzeGlobalAux(CE, GS, PhiUsers))
+        return true;
+    } else if (const Instruction *I = dyn_cast<Instruction>(U)) {
+      if (!GS.HasMultipleAccessingFunctions) {
+        const Function *F = I->getParent()->getParent();
+        if (GS.AccessingFunction == 0)
+          GS.AccessingFunction = F;
+        else if (GS.AccessingFunction != F)
+          GS.HasMultipleAccessingFunctions = true;
+      }
+      if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
+        GS.IsLoaded = true;
+        // Don't hack on volatile loads.
+        if (LI->isVolatile())
+          return true;
+        GS.Ordering = strongerOrdering(GS.Ordering, LI->getOrdering());
+      } else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
+        // Don't allow a store OF the address, only stores TO the address.
+        if (SI->getOperand(0) == V)
+          return true;
+
+        // Don't hack on volatile stores.
+        if (SI->isVolatile())
+          return true;
+
+        GS.Ordering = strongerOrdering(GS.Ordering, SI->getOrdering());
+
+        // If this is a direct store to the global (i.e., the global is a scalar
+        // value, not an aggregate), keep more specific information about
+        // stores.
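+        // Rough sketch of the transitions tracked below (the stored values
+        // are hypothetical): re-storing the initializer (or a value just
+        // loaded from the global itself) keeps InitializerStored, the first
+        // store of some other value moves to StoredOnce and records that
+        // value, and a second, different value degrades the state to Stored.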
+ if (GS.StoredType != GlobalStatus::Stored) { + if (const GlobalVariable *GV = + dyn_cast<GlobalVariable>(SI->getOperand(1))) { + Value *StoredVal = SI->getOperand(0); + + if (Constant *C = dyn_cast<Constant>(StoredVal)) { + if (C->isThreadDependent()) { + // The stored value changes between threads; don't track it. + return true; + } + } + + if (StoredVal == GV->getInitializer()) { + if (GS.StoredType < GlobalStatus::InitializerStored) + GS.StoredType = GlobalStatus::InitializerStored; + } else if (isa<LoadInst>(StoredVal) && + cast<LoadInst>(StoredVal)->getOperand(0) == GV) { + if (GS.StoredType < GlobalStatus::InitializerStored) + GS.StoredType = GlobalStatus::InitializerStored; + } else if (GS.StoredType < GlobalStatus::StoredOnce) { + GS.StoredType = GlobalStatus::StoredOnce; + GS.StoredOnceValue = StoredVal; + } else if (GS.StoredType == GlobalStatus::StoredOnce && + GS.StoredOnceValue == StoredVal) { + // noop. + } else { + GS.StoredType = GlobalStatus::Stored; + } + } else { + GS.StoredType = GlobalStatus::Stored; + } + } + } else if (isa<BitCastInst>(I)) { + if (analyzeGlobalAux(I, GS, PhiUsers)) + return true; + } else if (isa<GetElementPtrInst>(I)) { + if (analyzeGlobalAux(I, GS, PhiUsers)) + return true; + } else if (isa<SelectInst>(I)) { + if (analyzeGlobalAux(I, GS, PhiUsers)) + return true; + } else if (const PHINode *PN = dyn_cast<PHINode>(I)) { + // PHI nodes we can check just like select or GEP instructions, but we + // have to be careful about infinite recursion. + if (PhiUsers.insert(PN)) // Not already visited. + if (analyzeGlobalAux(I, GS, PhiUsers)) + return true; + } else if (isa<CmpInst>(I)) { + GS.IsCompared = true; + } else if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) { + if (MTI->isVolatile()) + return true; + if (MTI->getArgOperand(0) == V) + GS.StoredType = GlobalStatus::Stored; + if (MTI->getArgOperand(1) == V) + GS.IsLoaded = true; + } else if (const MemSetInst *MSI = dyn_cast<MemSetInst>(I)) { + assert(MSI->getArgOperand(0) == V && "Memset only takes one pointer!"); + if (MSI->isVolatile()) + return true; + GS.StoredType = GlobalStatus::Stored; + } else if (ImmutableCallSite C = I) { + if (!C.isCallee(UI)) + return true; + GS.IsLoaded = true; + } else { + return true; // Any other non-load instruction might take address! + } + } else if (const Constant *C = dyn_cast<Constant>(U)) { + GS.HasNonInstructionUser = true; + // We might have a dead and dangling constant hanging off of here. + if (!isSafeToDestroyConstant(C)) + return true; + } else { + GS.HasNonInstructionUser = true; + // Otherwise must be some other user. + return true; + } + } + + return false; +} + +bool GlobalStatus::analyzeGlobal(const Value *V, GlobalStatus &GS) { + SmallPtrSet<const PHINode *, 16> PhiUsers; + return analyzeGlobalAux(V, GS, PhiUsers); +} + +GlobalStatus::GlobalStatus() + : IsCompared(false), IsLoaded(false), StoredType(NotStored), + StoredOnceValue(0), AccessingFunction(0), + HasMultipleAccessingFunctions(false), HasNonInstructionUser(false), + Ordering(NotAtomic) {} diff --git a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp index dabb67b9..d021bce 100644 --- a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -193,7 +193,8 @@ static bool HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, CallInst *CI = dyn_cast<CallInst>(I); // If this call cannot unwind, don't convert it to an invoke. 
- if (!CI || CI->doesNotThrow()) + // Inline asm calls cannot throw. + if (!CI || CI->doesNotThrow() || isa<InlineAsm>(CI->getCalledValue())) continue; // Convert this function call into an invoke instruction. First, split the diff --git a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp index 2d1b166..f15e8d5 100644 --- a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp @@ -55,7 +55,6 @@ namespace { DominatorTree *DT; LoopInfo *LI; ScalarEvolution *SE; - std::vector<BasicBlock*> LoopBlocks; PredIteratorCache PredCache; Loop *L; @@ -82,11 +81,6 @@ namespace { // Check the special guarantees that LCSSA makes. assert(L->isLCSSAForm(*DT) && "LCSSA form not preserved!"); } - - /// inLoop - returns true if the given block is within the current loop - bool inLoop(BasicBlock *B) const { - return std::binary_search(LoopBlocks.begin(), LoopBlocks.end(), B); - } }; } @@ -129,11 +123,6 @@ bool LCSSA::runOnLoop(Loop *TheLoop, LPPassManager &LPM) { if (ExitBlocks.empty()) return false; - // Speed up queries by creating a sorted vector of blocks. - LoopBlocks.clear(); - LoopBlocks.insert(LoopBlocks.end(), L->block_begin(), L->block_end()); - array_pod_sort(LoopBlocks.begin(), LoopBlocks.end()); - // Look at all the instructions in the loop, checking to see if they have uses // outside the loop. If so, rewrite those uses. bool MadeChange = false; @@ -198,7 +187,7 @@ bool LCSSA::ProcessInstruction(Instruction *Inst, if (PHINode *PN = dyn_cast<PHINode>(U)) UserBB = PN->getIncomingBlock(UI); - if (InstBB != UserBB && !inLoop(UserBB)) + if (InstBB != UserBB && !L->contains(UserBB)) UsesToRewrite.push_back(&UI.getUse()); } @@ -244,7 +233,7 @@ bool LCSSA::ProcessInstruction(Instruction *Inst, // If the exit block has a predecessor not within the loop, arrange for // the incoming value use corresponding to that predecessor to be // rewritten in terms of a different LCSSA PHI. - if (!inLoop(*PI)) + if (!L->contains(*PI)) UsesToRewrite.push_back( &PN->getOperandUse( PN->getOperandNumForIncomingValue(PN->getNumIncomingValues()-1))); diff --git a/contrib/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm/lib/Transforms/Utils/Local.cpp index 12e5b3e..2768041 100644 --- a/contrib/llvm/lib/Transforms/Utils/Local.cpp +++ b/contrib/llvm/lib/Transforms/Utils/Local.cpp @@ -16,10 +16,10 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" -#include "llvm/Analysis/ProfileInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/DIBuilder.h" #include "llvm/DebugInfo.h" @@ -43,6 +43,8 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +STATISTIC(NumRemoved, "Number of unreachable basic blocks removed"); + //===----------------------------------------------------------------------===// // Local constant propagation. // @@ -84,7 +86,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, BI->eraseFromParent(); return true; } - + if (Dest2 == Dest1) { // Conditional branch to same location? 
// This branch matches something like this: // br bool %cond, label %Dest, label %Dest @@ -104,7 +106,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, } return false; } - + if (SwitchInst *SI = dyn_cast<SwitchInst>(T)) { // If we are switching on a constant, we can convert the switch into a // single branch instruction! @@ -188,38 +190,33 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI); return true; } - + if (SI->getNumCases() == 1) { // Otherwise, we can fold this switch into a conditional branch // instruction if it has only one non-default destination. SwitchInst::CaseIt FirstCase = SI->case_begin(); - IntegersSubset& Case = FirstCase.getCaseValueEx(); - if (Case.isSingleNumber()) { - // FIXME: Currently work with ConstantInt based numbers. - Value *Cond = Builder.CreateICmpEQ(SI->getCondition(), - Case.getSingleNumber(0).toConstantInt(), - "cond"); - - // Insert the new branch. - BranchInst *NewBr = Builder.CreateCondBr(Cond, - FirstCase.getCaseSuccessor(), - SI->getDefaultDest()); - MDNode* MD = SI->getMetadata(LLVMContext::MD_prof); - if (MD && MD->getNumOperands() == 3) { - ConstantInt *SICase = dyn_cast<ConstantInt>(MD->getOperand(2)); - ConstantInt *SIDef = dyn_cast<ConstantInt>(MD->getOperand(1)); - assert(SICase && SIDef); - // The TrueWeight should be the weight for the single case of SI. - NewBr->setMetadata(LLVMContext::MD_prof, - MDBuilder(BB->getContext()). - createBranchWeights(SICase->getValue().getZExtValue(), - SIDef->getValue().getZExtValue())); - } + Value *Cond = Builder.CreateICmpEQ(SI->getCondition(), + FirstCase.getCaseValue(), "cond"); - // Delete the old switch. - SI->eraseFromParent(); - return true; + // Insert the new branch. + BranchInst *NewBr = Builder.CreateCondBr(Cond, + FirstCase.getCaseSuccessor(), + SI->getDefaultDest()); + MDNode* MD = SI->getMetadata(LLVMContext::MD_prof); + if (MD && MD->getNumOperands() == 3) { + ConstantInt *SICase = dyn_cast<ConstantInt>(MD->getOperand(2)); + ConstantInt *SIDef = dyn_cast<ConstantInt>(MD->getOperand(1)); + assert(SICase && SIDef); + // The TrueWeight should be the weight for the single case of SI. + NewBr->setMetadata(LLVMContext::MD_prof, + MDBuilder(BB->getContext()). + createBranchWeights(SICase->getValue().getZExtValue(), + SIDef->getValue().getZExtValue())); } + + // Delete the old switch. + SI->eraseFromParent(); + return true; } return false; } @@ -231,7 +228,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, BasicBlock *TheOnlyDest = BA->getBasicBlock(); // Insert the new branch. Builder.CreateBr(TheOnlyDest); - + for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) { if (IBI->getDestination(i) == TheOnlyDest) TheOnlyDest = 0; @@ -242,7 +239,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, IBI->eraseFromParent(); if (DeleteDeadConditions) RecursivelyDeleteTriviallyDeadInstructions(Address, TLI); - + // If we didn't find our destination in the IBI successor list, then we // have undefined behavior. Replace the unconditional branch with an // 'unreachable' instruction. 
@@ -250,11 +247,11 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, BB->getTerminator()->eraseFromParent(); new UnreachableInst(BB->getContext(), BB); } - + return true; } } - + return false; } @@ -321,10 +318,10 @@ llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V, Instruction *I = dyn_cast<Instruction>(V); if (!I || !I->use_empty() || !isInstructionTriviallyDead(I, TLI)) return false; - + SmallVector<Instruction*, 16> DeadInsts; DeadInsts.push_back(I); - + do { I = DeadInsts.pop_back_val(); @@ -333,9 +330,9 @@ llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V, for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { Value *OpV = I->getOperand(i); I->setOperand(i, 0); - + if (!OpV->use_empty()) continue; - + // If the operand is an instruction that became dead as we nulled out the // operand, and if it is 'trivially' dead, delete it in a future loop // iteration. @@ -343,7 +340,7 @@ llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V, if (isInstructionTriviallyDead(OpI, TLI)) DeadInsts.push_back(OpI); } - + I->eraseFromParent(); } while (!DeadInsts.empty()); @@ -415,7 +412,7 @@ bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const DataLayout *TD, Instruction *Inst = BI++; WeakVH BIHandle(BI); - if (recursivelySimplifyInstruction(Inst, TD)) { + if (recursivelySimplifyInstruction(Inst, TD, TLI)) { MadeChange = true; if (BIHandle != BI) BI = BB->begin(); @@ -450,12 +447,12 @@ void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred, // This only adjusts blocks with PHI nodes. if (!isa<PHINode>(BB->begin())) return; - + // Remove the entries for Pred from the PHI nodes in BB, but do not simplify // them down. This will leave us with single entry phi nodes and other phis // that can be removed. BB->removePredecessor(Pred, true); - + WeakVH PhiIt = &BB->front(); while (PHINode *PN = dyn_cast<PHINode>(PhiIt)) { PhiIt = &*++BasicBlock::iterator(cast<Instruction>(PhiIt)); @@ -486,10 +483,10 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, Pass *P) { PN->replaceAllUsesWith(NewVal); PN->eraseFromParent(); } - + BasicBlock *PredBB = DestBB->getSinglePredecessor(); assert(PredBB && "Block doesn't have a single predecessor!"); - + // Zap anything that took the address of DestBB. Not doing this will give the // address an invalid value. if (DestBB->hasAddressTaken()) { @@ -500,10 +497,10 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, Pass *P) { BA->getType())); BA->destroyConstant(); } - + // Anything that branched to PredBB now branches to DestBB. PredBB->replaceAllUsesWith(DestBB); - + // Splice all the instructions from PredBB to DestBB. PredBB->getTerminator()->eraseFromParent(); DestBB->getInstList().splice(DestBB->begin(), PredBB->getInstList()); @@ -515,25 +512,27 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, Pass *P) { DT->changeImmediateDominator(DestBB, PredBBIDom); DT->eraseNode(PredBB); } - ProfileInfo *PI = P->getAnalysisIfAvailable<ProfileInfo>(); - if (PI) { - PI->replaceAllUses(PredBB, DestBB); - PI->removeEdge(ProfileInfo::getEdge(PredBB, DestBB)); - } } // Nuke BB. PredBB->eraseFromParent(); } +/// CanMergeValues - Return true if we can choose one of these values to use +/// in place of the other. Note that we will always choose the non-undef +/// value to keep. 
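+///
+/// For example (values hypothetical): (undef, %v), (%v, undef) and (%v, %v)
+/// are mergeable, keeping %v, while (%a, %b) with two distinct non-undef
+/// values are not.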
+static bool CanMergeValues(Value *First, Value *Second) {
+  return First == Second || isa<UndefValue>(First) || isa<UndefValue>(Second);
+}
+
 /// CanPropagatePredecessorsForPHIs - Return true if we can fold BB, an
-/// almost-empty BB ending in an unconditional branch to Succ, into succ.
+/// almost-empty BB ending in an unconditional branch to Succ, into Succ.
 ///
 /// Assumption: Succ is the single successor for BB.
 ///
 static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {
   assert(*succ_begin(BB) == Succ && "Succ is not successor of BB!");
 
-  DEBUG(dbgs() << "Looking to fold " << BB->getName() << " into " 
+  DEBUG(dbgs() << "Looking to fold " << BB->getName() << " into "
         << Succ->getName() << "\n");
   // Shortcut, if there is only a single predecessor it must be BB and merging
   // is always safe
@@ -555,9 +554,10 @@
       for (unsigned PI = 0, PE = PN->getNumIncomingValues(); PI != PE; ++PI) {
         BasicBlock *IBB = PN->getIncomingBlock(PI);
         if (BBPreds.count(IBB) &&
-            BBPN->getIncomingValueForBlock(IBB) != PN->getIncomingValue(PI)) {
-          DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() << " in "
-                << Succ->getName() << " is conflicting with "
+            !CanMergeValues(BBPN->getIncomingValueForBlock(IBB),
+                            PN->getIncomingValue(PI))) {
+          DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() << " in "
+                << Succ->getName() << " is conflicting with "
                 << BBPN->getName() << " with regard to common predecessor "
                 << IBB->getName() << "\n");
           return false;
@@ -570,8 +570,9 @@
       // one for BB, in which case this phi node will not prevent the merging
       // of the block.
       BasicBlock *IBB = PN->getIncomingBlock(PI);
-      if (BBPreds.count(IBB) && Val != PN->getIncomingValue(PI)) {
-        DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() << " in "
+      if (BBPreds.count(IBB) &&
+          !CanMergeValues(Val, PN->getIncomingValue(PI))) {
+        DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() << " in "
               << Succ->getName() << " is conflicting with regard to common "
               << "predecessor " << IBB->getName() << "\n");
         return false;
@@ -583,6 +584,139 @@
   return true;
 }
 
+typedef SmallVector<BasicBlock *, 16> PredBlockVector;
+typedef DenseMap<BasicBlock *, Value *> IncomingValueMap;
+
+/// \brief Determines the value to use as the phi node input for a block.
+///
+/// Select between \p OldVal and any value that we know flows from \p BB
+/// to a particular phi on the basis of which one (if either) is not
+/// undef. Update IncomingValues based on the selected value.
+///
+/// \param OldVal The value we are considering selecting.
+/// \param BB The block that the value flows in from.
+/// \param IncomingValues A map from block-to-value for other phi inputs
+/// that we have examined.
+///
+/// \returns the selected value.
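+///
+/// For instance (names hypothetical), if \p OldVal is undef and an earlier
+/// phi input already recorded %v for \p BB, %v is selected so that undef
+/// inputs stay consistent with the known non-undef value.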
+static Value *selectIncomingValueForBlock(Value *OldVal, BasicBlock *BB, + IncomingValueMap &IncomingValues) { + if (!isa<UndefValue>(OldVal)) { + assert((!IncomingValues.count(BB) || + IncomingValues.find(BB)->second == OldVal) && + "Expected OldVal to match incoming value from BB!"); + + IncomingValues.insert(std::make_pair(BB, OldVal)); + return OldVal; + } + + IncomingValueMap::const_iterator It = IncomingValues.find(BB); + if (It != IncomingValues.end()) return It->second; + + return OldVal; +} + +/// \brief Create a map from block to value for the operands of a +/// given phi. +/// +/// Create a map from block to value for each non-undef value flowing +/// into \p PN. +/// +/// \param PN The phi we are collecting the map for. +/// \param IncomingValues [out] The map from block to value for this phi. +static void gatherIncomingValuesToPhi(PHINode *PN, + IncomingValueMap &IncomingValues) { + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + BasicBlock *BB = PN->getIncomingBlock(i); + Value *V = PN->getIncomingValue(i); + + if (!isa<UndefValue>(V)) + IncomingValues.insert(std::make_pair(BB, V)); + } +} + +/// \brief Replace the incoming undef values to a phi with the values +/// from a block-to-value map. +/// +/// \param PN The phi we are replacing the undefs in. +/// \param IncomingValues A map from block to value. +static void replaceUndefValuesInPhi(PHINode *PN, + const IncomingValueMap &IncomingValues) { + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *V = PN->getIncomingValue(i); + + if (!isa<UndefValue>(V)) continue; + + BasicBlock *BB = PN->getIncomingBlock(i); + IncomingValueMap::const_iterator It = IncomingValues.find(BB); + if (It == IncomingValues.end()) continue; + + PN->setIncomingValue(i, It->second); + } +} + +/// \brief Replace a value flowing from a block to a phi with +/// potentially multiple instances of that value flowing from the +/// block's predecessors to the phi. +/// +/// \param BB The block with the value flowing into the phi. +/// \param BBPreds The predecessors of BB. +/// \param PN The phi that we are updating. +static void redirectValuesFromPredecessorsToPhi(BasicBlock *BB, + const PredBlockVector &BBPreds, + PHINode *PN) { + Value *OldVal = PN->removeIncomingValue(BB, false); + assert(OldVal && "No entry in PHI for Pred BB!"); + + IncomingValueMap IncomingValues; + + // We are merging two blocks - BB, and the block containing PN - and + // as a result we need to redirect edges from the predecessors of BB + // to go to the block containing PN, and update PN + // accordingly. Since we allow merging blocks in the case where the + // predecessor and successor blocks both share some predecessors, + // and where some of those common predecessors might have undef + // values flowing into PN, we want to rewrite those values to be + // consistent with the non-undef values. + + gatherIncomingValuesToPhi(PN, IncomingValues); + + // If this incoming value is one of the PHI nodes in BB, the new entries + // in the PHI node are the entries from the old PHI. + if (isa<PHINode>(OldVal) && cast<PHINode>(OldVal)->getParent() == BB) { + PHINode *OldValPN = cast<PHINode>(OldVal); + for (unsigned i = 0, e = OldValPN->getNumIncomingValues(); i != e; ++i) { + // Note that, since we are merging phi nodes and BB and Succ might + // have common predecessors, we could end up with a phi node with + // identical incoming branches. 
This will be cleaned up later (and + // will trigger asserts if we try to clean it up now, without also + // simplifying the corresponding conditional branch). + BasicBlock *PredBB = OldValPN->getIncomingBlock(i); + Value *PredVal = OldValPN->getIncomingValue(i); + Value *Selected = selectIncomingValueForBlock(PredVal, PredBB, + IncomingValues); + + // And add a new incoming value for this predecessor for the + // newly retargeted branch. + PN->addIncoming(Selected, PredBB); + } + } else { + for (unsigned i = 0, e = BBPreds.size(); i != e; ++i) { + // Update existing incoming values in PN for this + // predecessor of BB. + BasicBlock *PredBB = BBPreds[i]; + Value *Selected = selectIncomingValueForBlock(OldVal, PredBB, + IncomingValues); + + // And add a new incoming value for this predecessor for the + // newly retargeted branch. + PN->addIncoming(Selected, PredBB); + } + } + + replaceUndefValuesInPhi(PN, IncomingValues); +} + /// TryToSimplifyUncondBranchFromEmptyBlock - BB is known to contain an /// unconditional branch, and contains no instructions other than PHI nodes, /// potential side-effect free intrinsics and the branch. If possible, @@ -595,7 +729,7 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) { // We can't eliminate infinite loops. BasicBlock *Succ = cast<BranchInst>(BB->getTerminator())->getSuccessor(0); if (BB == Succ) return false; - + // Check to see if merging these blocks would cause conflicts for any of the // phi nodes in BB or Succ. If not, we can safely merge. if (!CanPropagatePredecessorsForPHIs(BB, Succ)) return false; @@ -629,39 +763,21 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) { } DEBUG(dbgs() << "Killing Trivial BB: \n" << *BB); - + if (isa<PHINode>(Succ->begin())) { // If there is more than one pred of succ, and there are PHI nodes in // the successor, then we need to add incoming edges for the PHI nodes // - const SmallVector<BasicBlock*, 16> BBPreds(pred_begin(BB), pred_end(BB)); - + const PredBlockVector BBPreds(pred_begin(BB), pred_end(BB)); + // Loop over all of the PHI nodes in the successor of BB. for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) { PHINode *PN = cast<PHINode>(I); - Value *OldVal = PN->removeIncomingValue(BB, false); - assert(OldVal && "No entry in PHI for Pred BB!"); - - // If this incoming value is one of the PHI nodes in BB, the new entries - // in the PHI node are the entries from the old PHI. - if (isa<PHINode>(OldVal) && cast<PHINode>(OldVal)->getParent() == BB) { - PHINode *OldValPN = cast<PHINode>(OldVal); - for (unsigned i = 0, e = OldValPN->getNumIncomingValues(); i != e; ++i) - // Note that, since we are merging phi nodes and BB and Succ might - // have common predecessors, we could end up with a phi node with - // identical incoming branches. This will be cleaned up later (and - // will trigger asserts if we try to clean it up now, without also - // simplifying the corresponding conditional branch). - PN->addIncoming(OldValPN->getIncomingValue(i), - OldValPN->getIncomingBlock(i)); - } else { - // Add an incoming value for each of the new incoming values. - for (unsigned i = 0, e = BBPreds.size(); i != e; ++i) - PN->addIncoming(OldVal, BBPreds[i]); - } + + redirectValuesFromPredecessorsToPhi(BB, BBPreds, PN); } } - + if (Succ->getSinglePredecessor()) { // BB is the only predecessor of Succ, so Succ will end up with exactly // the same predecessors BB had. 
@@ -676,7 +792,7 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) { PN->eraseFromParent(); } } - + // Everything that jumped to BB now goes to Succ. BB->replaceAllUsesWith(Succ); if (!Succ->hasName()) Succ->takeName(BB); @@ -784,7 +900,7 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Align, // the final program then it is impossible for us to reliably enforce the // preferred alignment. if (GV->isWeakForLinker()) return Align; - + if (GV->getAlignment() >= PrefAlign) return GV->getAlignment(); // We can only increase the alignment of the global if it has no alignment @@ -804,26 +920,27 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Align, /// and it is more than the alignment of the ultimate object, see if we can /// increase the alignment of the ultimate object, making this check succeed. unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, - const DataLayout *TD) { + const DataLayout *DL) { assert(V->getType()->isPointerTy() && "getOrEnforceKnownAlignment expects a pointer!"); - unsigned BitWidth = TD ? TD->getPointerSizeInBits() : 64; + unsigned BitWidth = DL ? DL->getPointerTypeSizeInBits(V->getType()) : 64; + APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - ComputeMaskedBits(V, KnownZero, KnownOne, TD); + ComputeMaskedBits(V, KnownZero, KnownOne, DL); unsigned TrailZ = KnownZero.countTrailingOnes(); - - // Avoid trouble with rediculously large TrailZ values, such as + + // Avoid trouble with ridiculously large TrailZ values, such as // those computed from a null pointer. TrailZ = std::min(TrailZ, unsigned(sizeof(unsigned) * CHAR_BIT - 1)); - + unsigned Align = 1u << std::min(BitWidth - 1, TrailZ); - + // LLVM doesn't support alignments larger than this currently. Align = std::min(Align, +Value::MaximumAlignment); - + if (PrefAlign > Align) - Align = enforceKnownAlignment(V, Align, PrefAlign, TD); - + Align = enforceKnownAlignment(V, Align, PrefAlign, DL); + // We don't need to make any adjustment. return Align; } @@ -854,7 +971,9 @@ static bool LdStHasDebugValue(DIVariable &DIVar, Instruction *I) { bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, StoreInst *SI, DIBuilder &Builder) { DIVariable DIVar(DDI->getVariable()); - if (!DIVar.Verify()) + assert((!DIVar || DIVar.isVariable()) && + "Variable in DbgDeclareInst should be either null or a DIVariable."); + if (!DIVar) return false; if (LdStHasDebugValue(DIVar, SI)) @@ -888,16 +1007,18 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, LoadInst *LI, DIBuilder &Builder) { DIVariable DIVar(DDI->getVariable()); - if (!DIVar.Verify()) + assert((!DIVar || DIVar.isVariable()) && + "Variable in DbgDeclareInst should be either null or a DIVariable."); + if (!DIVar) return false; if (LdStHasDebugValue(DIVar, LI)) return true; - Instruction *DbgVal = + Instruction *DbgVal = Builder.insertDbgValueIntrinsic(LI->getOperand(0), 0, DIVar, LI); - + // Propagate any debug metadata from the store onto the dbg.value. 
DebugLoc LIDL = LI->getDebugLoc(); if (!LIDL.isUnknown()) @@ -921,10 +1042,14 @@ bool llvm::LowerDbgDeclare(Function &F) { if (Dbgs.empty()) return false; - for (SmallVector<DbgDeclareInst *, 4>::iterator I = Dbgs.begin(), + for (SmallVectorImpl<DbgDeclareInst *>::iterator I = Dbgs.begin(), E = Dbgs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; - if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress())) { + AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress()); + // If this is an alloca for a scalar variable, insert a dbg.value + // at each load and store to the alloca and erase the dbg.declare. + if (AI && !AI->isArrayAllocation()) { + // We only remove the dbg.declare intrinsic if all uses are // converted to dbg.value intrinsics. bool RemoveDDI = true; @@ -961,7 +1086,9 @@ bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, if (!DDI) return false; DIVariable DIVar(DDI->getVariable()); - if (!DIVar.Verify()) + assert((!DIVar || DIVar.isVariable()) && + "Variable in DbgDeclareInst should be either null or a DIVariable."); + if (!DIVar) return false; // Create a copy of the original DIDescriptor for user variable, appending @@ -990,33 +1117,153 @@ bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, return true; } -bool llvm::removeUnreachableBlocks(Function &F) { - SmallPtrSet<BasicBlock*, 16> Reachable; +/// changeToUnreachable - Insert an unreachable instruction before the specified +/// instruction, making it and the rest of the code in the block dead. +static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) { + BasicBlock *BB = I->getParent(); + // Loop over all of the successors, removing BB's entry from any PHI + // nodes. + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) + (*SI)->removePredecessor(BB); + + // Insert a call to llvm.trap right before this. This turns the undefined + // behavior into a hard fail instead of falling through into random code. + if (UseLLVMTrap) { + Function *TrapFn = + Intrinsic::getDeclaration(BB->getParent()->getParent(), Intrinsic::trap); + CallInst *CallTrap = CallInst::Create(TrapFn, "", I); + CallTrap->setDebugLoc(I->getDebugLoc()); + } + new UnreachableInst(I->getContext(), I); + + // All instructions after this are dead. + BasicBlock::iterator BBI = I, BBE = BB->end(); + while (BBI != BBE) { + if (!BBI->use_empty()) + BBI->replaceAllUsesWith(UndefValue::get(BBI->getType())); + BB->getInstList().erase(BBI++); + } +} + +/// changeToCall - Convert the specified invoke into a normal call. +static void changeToCall(InvokeInst *II) { + SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); + CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II); + NewCall->takeName(II); + NewCall->setCallingConv(II->getCallingConv()); + NewCall->setAttributes(II->getAttributes()); + NewCall->setDebugLoc(II->getDebugLoc()); + II->replaceAllUsesWith(NewCall); + + // Follow the call by a branch to the normal destination. 
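+  // Overall rewrite, sketched with illustrative IR names only:
+  //   invoke void @f() to label %normal unwind label %lpad
+  // becomes
+  //   call void @f()
+  //   br label %normal
+  // with %lpad losing this block as a predecessor (handled just below).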
+  BranchInst::Create(II->getNormalDest(), II);
+
+  // Update PHI nodes in the unwind destination.
+  II->getUnwindDest()->removePredecessor(II->getParent());
+  II->eraseFromParent();
+}
+
+static bool markAliveBlocks(BasicBlock *BB,
+                            SmallPtrSet<BasicBlock*, 128> &Reachable) {
+
   SmallVector<BasicBlock*, 128> Worklist;
-  Worklist.push_back(&F.getEntryBlock());
-  Reachable.insert(&F.getEntryBlock());
+  Worklist.push_back(BB);
+  Reachable.insert(BB);
+  bool Changed = false;
   do {
-    BasicBlock *BB = Worklist.pop_back_val();
+    BB = Worklist.pop_back_val();
+
+    // Do a quick scan of the basic block, turning any obviously unreachable
+    // instructions into LLVM unreachable insts. The instruction combining pass
+    // canonicalizes unreachable insts into stores to null or undef.
+    for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ++BBI) {
+      if (CallInst *CI = dyn_cast<CallInst>(BBI)) {
+        if (CI->doesNotReturn()) {
+          // If we found a call to a no-return function, insert an unreachable
+          // instruction after it. Make sure there isn't *already* one there
+          // though.
+          ++BBI;
+          if (!isa<UnreachableInst>(BBI)) {
+            // Don't insert a call to llvm.trap right before the unreachable.
+            changeToUnreachable(BBI, false);
+            Changed = true;
+          }
+          break;
+        }
+      }
+
+      // Store to undef and store to null are undefined and used to signal that
+      // they should be changed to unreachable by passes that can't modify the
+      // CFG.
+      if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) {
+        // Don't touch volatile stores.
+        if (SI->isVolatile()) continue;
+
+        Value *Ptr = SI->getOperand(1);
+
+        if (isa<UndefValue>(Ptr) ||
+            (isa<ConstantPointerNull>(Ptr) &&
+             SI->getPointerAddressSpace() == 0)) {
+          changeToUnreachable(SI, true);
+          Changed = true;
+          break;
+        }
+      }
+    }
+
+    // Turn invokes that call 'nounwind' functions into ordinary calls.
+    if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
+      Value *Callee = II->getCalledValue();
+      if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
+        changeToUnreachable(II, true);
+        Changed = true;
+      } else if (II->doesNotThrow()) {
+        if (II->use_empty() && II->onlyReadsMemory()) {
+          // The invoke's result is unused and it has no side effects, so it
+          // can be dropped; just branch to the normal destination.
+          BranchInst::Create(II->getNormalDest(), II);
+          II->getUnwindDest()->removePredecessor(II->getParent());
+          II->eraseFromParent();
+        } else
+          changeToCall(II);
+        Changed = true;
+      }
+    }
+
+    Changed |= ConstantFoldTerminator(BB, true);
     for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
       if (Reachable.insert(*SI))
        Worklist.push_back(*SI);
   } while (!Worklist.empty());
+  return Changed;
+}
+
+/// removeUnreachableBlocks - Remove blocks that are not reachable, even
+/// if they are in a dead cycle. Return true if a change was made, false
+/// otherwise.
+bool llvm::removeUnreachableBlocks(Function &F) {
+  SmallPtrSet<BasicBlock*, 128> Reachable;
+  bool Changed = markAliveBlocks(F.begin(), Reachable);
+
   // If there are unreachable blocks in the CFG...
   if (Reachable.size() == F.size())
-    return false;
+    return Changed;
 
   assert(Reachable.size() < F.size());
-  for (Function::iterator I = llvm::next(F.begin()), E = F.end(); I != E; ++I) {
-    if (Reachable.count(I))
+  NumRemoved += F.size()-Reachable.size();
+
+  // Loop over all of the basic blocks that are not reachable, dropping all of
+  // their internal references...
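+  // (Unreachable blocks may still branch to one another in a dead cycle, so
+  // all references must be dropped before any of the blocks can be erased.)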
+ for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) { + if (Reachable.count(BB)) continue; - for (succ_iterator SI = succ_begin(I), SE = succ_end(I); SI != SE; ++SI) + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) if (Reachable.count(*SI)) - (*SI)->removePredecessor(I); - I->dropAllReferences(); + (*SI)->removePredecessor(BB); + BB->dropAllReferences(); } - for (Function::iterator I = llvm::next(F.begin()), E=F.end(); I != E;) + for (Function::iterator I = ++F.begin(); I != F.end();) if (!Reachable.count(I)) I = F.getBasicBlockList().erase(I); else diff --git a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp index 37819cc..6d5f16c 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -59,6 +59,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; STATISTIC(NumInserted, "Number of pre-header or exit blocks inserted"); @@ -100,16 +101,16 @@ namespace { private: bool ProcessLoop(Loop *L, LPPassManager &LPM); BasicBlock *RewriteLoopExitBlock(Loop *L, BasicBlock *Exit); - BasicBlock *InsertPreheaderForLoop(Loop *L); Loop *SeparateNestedLoop(Loop *L, LPPassManager &LPM, BasicBlock *Preheader); BasicBlock *InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader); - void PlaceSplitBlockCarefully(BasicBlock *NewBB, - SmallVectorImpl<BasicBlock*> &SplitPreds, - Loop *L); }; } +static void PlaceSplitBlockCarefully(BasicBlock *NewBB, + SmallVectorImpl<BasicBlock*> &SplitPreds, + Loop *L); + char LoopSimplify::ID = 0; INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify", "Canonicalize natural loops", true, false) @@ -208,7 +209,7 @@ ReprocessLoop: // Does the loop already have a preheader? If so, don't insert one. BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) { - Preheader = InsertPreheaderForLoop(L); + Preheader = InsertPreheaderForLoop(L, this); if (Preheader) { ++NumInserted; Changed = true; @@ -367,7 +368,7 @@ ReprocessLoop: /// preheader, this method is called to insert one. This method has two phases: /// preheader insertion and analysis updating. /// -BasicBlock *LoopSimplify::InsertPreheaderForLoop(Loop *L) { +BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) { BasicBlock *Header = L->getHeader(); // Compute the set of predecessors of the loop that are not in the loop. @@ -390,11 +391,11 @@ BasicBlock *LoopSimplify::InsertPreheaderForLoop(Loop *L) { BasicBlock *PreheaderBB; if (!Header->isLandingPad()) { PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader", - this); + PP); } else { SmallVector<BasicBlock*, 2> NewBBs; SplitLandingPadPredecessors(Header, OutsideBlocks, ".preheader", - ".split-lp", this, NewBBs); + ".split-lp", PP, NewBBs); PreheaderBB = NewBBs[0]; } @@ -491,9 +492,9 @@ static PHINode *FindPHIToPartitionLoops(Loop *L, DominatorTree *DT, // PlaceSplitBlockCarefully - If the block isn't already, move the new block to // right after some 'outside block' block. This prevents the preheader from // being placed inside the loop body, e.g. when the loop hasn't been rotated. 
-void LoopSimplify::PlaceSplitBlockCarefully(BasicBlock *NewBB, - SmallVectorImpl<BasicBlock*> &SplitPreds, - Loop *L) { +void PlaceSplitBlockCarefully(BasicBlock *NewBB, + SmallVectorImpl<BasicBlock*> &SplitPreds, + Loop *L) { // Check to see if NewBB is already well placed. Function::iterator BBI = NewBB; --BBI; for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) { diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp index cb581b3..162807d 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -90,7 +90,8 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, // Move all definitions in the successor to the predecessor... OnlyPred->getInstList().splice(OnlyPred->end(), BB->getInstList()); - std::string OldName = BB->getName(); + // OldName will be valid until erased. + StringRef OldName = BB->getName(); // Erase basic block from the function... @@ -102,12 +103,13 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, } } LI->removeBlock(BB); - BB->eraseFromParent(); // Inherit predecessor's name if it exists... if (!OldName.empty() && !OnlyPred->hasName()) OnlyPred->setName(OldName); + BB->eraseFromParent(); + return OnlyPred; } @@ -239,8 +241,6 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, DEBUG(dbgs() << "!\n"); } - std::vector<BasicBlock*> LoopBlocks = L->getBlocks(); - bool ContinueOnTrue = L->contains(BI->getSuccessor(0)); BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue); diff --git a/contrib/llvm/lib/Transforms/Utils/LowerExpectIntrinsic.cpp b/contrib/llvm/lib/Transforms/Utils/LowerExpectIntrinsic.cpp index 4aee8ff..e017f50 100644 --- a/contrib/llvm/lib/Transforms/Utils/LowerExpectIntrinsic.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LowerExpectIntrinsic.cpp @@ -29,7 +29,7 @@ using namespace llvm; -STATISTIC(IfHandled, "Number of 'expect' intrinsic intructions handled"); +STATISTIC(IfHandled, "Number of 'expect' intrinsic instructions handled"); static cl::opt<uint32_t> LikelyBranchWeight("likely-branch-weight", cl::Hidden, cl::init(64), diff --git a/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp b/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp index 9ec84d7..9799a30 100644 --- a/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp @@ -61,6 +61,8 @@ static cl::opt<bool> ExpensiveEHSupport("enable-correct-eh-support", namespace { class LowerInvoke : public FunctionPass { + const TargetMachine *TM; + // Used for both models. Constant *AbortFn; @@ -70,15 +72,12 @@ namespace { Constant *SetJmpFn, *LongJmpFn, *StackSaveFn, *StackRestoreFn; bool useExpensiveEHSupport; - // We peek in TLI to grab the target's jmp_buf size and alignment - const TargetLowering *TLI; - public: static char ID; // Pass identification, replacement for typeid - explicit LowerInvoke(const TargetLowering *tli = NULL, + explicit LowerInvoke(const TargetMachine *TM = 0, bool useExpensiveEHSupport = ExpensiveEHSupport) - : FunctionPass(ID), useExpensiveEHSupport(useExpensiveEHSupport), - TLI(tli) { + : FunctionPass(ID), TM(TM), + useExpensiveEHSupport(useExpensiveEHSupport) { initializeLowerInvokePass(*PassRegistry::getPassRegistry()); } bool doInitialization(Module &M); @@ -108,12 +107,9 @@ INITIALIZE_PASS(LowerInvoke, "lowerinvoke", char &llvm::LowerInvokePassID = LowerInvoke::ID; // Public Interface To the LowerInvoke pass. 
-FunctionPass *llvm::createLowerInvokePass(const TargetLowering *TLI) { - return new LowerInvoke(TLI, ExpensiveEHSupport); -} -FunctionPass *llvm::createLowerInvokePass(const TargetLowering *TLI, +FunctionPass *llvm::createLowerInvokePass(const TargetMachine *TM, bool useExpensiveEHSupport) { - return new LowerInvoke(TLI, useExpensiveEHSupport); + return new LowerInvoke(TM, useExpensiveEHSupport || ExpensiveEHSupport); } // doInitialization - Make sure that there is a prototype for abort in the @@ -122,6 +118,7 @@ bool LowerInvoke::doInitialization(Module &M) { Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext()); if (useExpensiveEHSupport) { // Insert a type for the linked list of jump buffers. + const TargetLowering *TLI = TM ? TM->getTargetLowering() : 0; unsigned JBSize = TLI ? TLI->getJumpBufSize() : 0; JBSize = JBSize ? JBSize : 200; Type *JmpBufTy = ArrayType::get(VoidPtrTy, JBSize); @@ -349,7 +346,6 @@ splitLiveRangesLiveAcrossInvokes(SmallVectorImpl<InvokeInst*> &Invokes) { // Scan all of the uses and see if the live range is live across an unwind // edge. If we find a use live across an invoke edge, create an alloca // and spill the value. - std::set<InvokeInst*> InvokesWithStoreInserted; // Find all of the blocks that this value is live in. std::set<BasicBlock*> LiveBBs; @@ -430,6 +426,7 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) { // Create an alloca for the incoming jump buffer ptr and the new jump buffer // that needs to be restored on all exits from the function. This is an // alloca because the value needs to be live across invokes. + const TargetLowering *TLI = TM ? TM->getTargetLowering() : 0; unsigned Align = TLI ? TLI->getJumpBufAlignment() : 0; AllocaInst *JmpBuf = new AllocaInst(JBLinkTy, 0, Align, diff --git a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp index 955b853..2d2a8a5 100644 --- a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp @@ -66,6 +66,18 @@ namespace { BasicBlock* OrigBlock, BasicBlock* Default); unsigned Clusterify(CaseVector& Cases, SwitchInst *SI); }; + + /// The comparison function for sorting the switch case values in the vector. + /// WARNING: Case ranges should be disjoint! 
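+  ///
+  /// For example (hypothetical disjoint ranges), [1,3] sorts before [5,7]
+  /// because 1 is signed-less-than 7; with overlapping ranges the Low/High
+  /// comparison below would not define a strict weak ordering.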
+  struct CaseCmp {
+    bool operator () (const LowerSwitch::CaseRange& C1,
+                      const LowerSwitch::CaseRange& C2) {
+
+      const ConstantInt* CI1 = cast<const ConstantInt>(C1.Low);
+      const ConstantInt* CI2 = cast<const ConstantInt>(C2.High);
+      return CI1->getValue().slt(CI2->getValue());
+    }
+  };
 }
 
 char LowerSwitch::ID = 0;
@@ -147,7 +159,7 @@ BasicBlock* LowerSwitch::switchConvert(CaseItr Begin, CaseItr End,
   Function::iterator FI = OrigBlock;
   F->getBasicBlockList().insert(++FI, NewNode);
 
-  ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_ULT,
+  ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_SLT,
                                 Val, Pivot.Low, "Pivot");
   NewNode->getInstList().push_back(Comp);
   BranchInst::Create(LBranch, RBranch, Comp, NewNode);
@@ -222,34 +234,40 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val,
 
 // Clusterify - Transform simple list of Cases into list of CaseRange's
 unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) {
-
-  IntegersSubsetToBB TheClusterifier;
+  unsigned numCmps = 0;
 
   // Start with "simple" cases
-  for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
-       i != e; ++i) {
-    BasicBlock *SuccBB = i.getCaseSuccessor();
-    IntegersSubset CaseRanges = i.getCaseValueEx();
-    TheClusterifier.add(CaseRanges, SuccBB);
-  }
-
-  TheClusterifier.optimize();
+  for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i)
+    Cases.push_back(CaseRange(i.getCaseValue(), i.getCaseValue(),
+                              i.getCaseSuccessor()));
 
-  size_t numCmps = 0;
-  for (IntegersSubsetToBB::RangeIterator i = TheClusterifier.begin(),
-       e = TheClusterifier.end(); i != e; ++i, ++numCmps) {
-    IntegersSubsetToBB::Cluster &C = *i;
-
-    // FIXME: Currently work with ConstantInt based numbers.
-    // Changing it to APInt based is a pretty heavy for this commit.
-    Cases.push_back(CaseRange(C.first.getLow().toConstantInt(),
-                              C.first.getHigh().toConstantInt(), C.second));
-    if (C.first.isSingleNumber())
+  std::sort(Cases.begin(), Cases.end(), CaseCmp());
+
+  // Merge cases into clusters.
+  if (Cases.size()>=2)
+    for (CaseItr I=Cases.begin(), J=llvm::next(Cases.begin()); J!=Cases.end(); ) {
+      int64_t nextValue = cast<ConstantInt>(J->Low)->getSExtValue();
+      int64_t currentValue = cast<ConstantInt>(I->High)->getSExtValue();
+      BasicBlock* nextBB = J->BB;
+      BasicBlock* currentBB = I->BB;
+
+      // If the two neighboring cases go to the same destination, merge them
+      // into a single case.
+      if ((nextValue-currentValue==1) && (currentBB == nextBB)) {
+        I->High = J->High;
+        J = Cases.erase(J);
+      } else {
+        I = J++;
+      }
+    }
+
+  for (CaseItr I=Cases.begin(), E=Cases.end(); I!=E; ++I, ++numCmps) {
+    if (I->Low != I->High)
       // A range counts double, since it requires two compares.
++numCmps; } - return numCmps; + return numCmps; } // processSwitchInst - Replace the specified switch instruction with a sequence diff --git a/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp b/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp index 3716f58..c3704531 100644 --- a/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp +++ b/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp @@ -53,7 +53,7 @@ namespace { } bool runOnModule(Module &M) { - static const char *metaNames[] = { + static const char *const metaNames[] = { // See http://en.wikipedia.org/wiki/Metasyntactic_variable "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge", "wibble", "wobble", "widget", "wombat", "ham", "eggs", "pluto", "spam" diff --git a/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp index d090b48..ff6e6f9 100644 --- a/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp +++ b/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -62,3 +63,20 @@ void llvm::appendToGlobalCtors(Module &M, Function *F, int Priority) { void llvm::appendToGlobalDtors(Module &M, Function *F, int Priority) { appendToGlobalArray("llvm.global_dtors", M, F, Priority); } + +GlobalVariable * +llvm::collectUsedGlobalVariables(Module &M, SmallPtrSet<GlobalValue *, 8> &Set, + bool CompilerUsed) { + const char *Name = CompilerUsed ? "llvm.compiler.used" : "llvm.used"; + GlobalVariable *GV = M.getGlobalVariable(Name); + if (!GV || !GV->hasInitializer()) + return GV; + + const ConstantArray *Init = cast<ConstantArray>(GV->getInitializer()); + for (unsigned I = 0, E = Init->getNumOperands(); I != E; ++I) { + Value *Op = Init->getOperand(I); + GlobalValue *G = cast<GlobalValue>(Op->stripPointerCastsNoFollowAliases()); + Set.insert(G); + } + return GV; +} diff --git a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index de335ec..8f6eee3 100644 --- a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -27,8 +27,8 @@ #define DEBUG_TYPE "mem2reg" #include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/Hashing.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -56,36 +56,13 @@ STATISTIC(NumSingleStore, "Number of alloca's promoted with a single store"); STATISTIC(NumDeadAlloca, "Number of dead alloca's removed"); STATISTIC(NumPHIInsert, "Number of PHI nodes inserted"); -namespace llvm { -template<> -struct DenseMapInfo<std::pair<BasicBlock*, unsigned> > { - typedef std::pair<BasicBlock*, unsigned> EltTy; - static inline EltTy getEmptyKey() { - return EltTy(reinterpret_cast<BasicBlock*>(-1), ~0U); - } - static inline EltTy getTombstoneKey() { - return EltTy(reinterpret_cast<BasicBlock*>(-2), 0U); - } - static unsigned getHashValue(const std::pair<BasicBlock*, unsigned> &Val) { - using llvm::hash_value; - return static_cast<unsigned>(hash_value(Val)); - } - static bool isEqual(const EltTy &LHS, const EltTy &RHS) { - return LHS == RHS; - } -}; -} - -/// isAllocaPromotable - Return true if this alloca is legal for promotion. 
-/// This is true if there are only loads and stores to the alloca. -/// bool llvm::isAllocaPromotable(const AllocaInst *AI) { // FIXME: If the memory unit is of pointer or integer type, we can permit // assignments to subsections of the memory unit. // Only allow direct and non-volatile loads and stores... for (Value::const_use_iterator UI = AI->use_begin(), UE = AI->use_end(); - UI != UE; ++UI) { // Loop over all of the uses of the alloca + UI != UE; ++UI) { // Loop over all of the uses of the alloca const User *U = *UI; if (const LoadInst *LI = dyn_cast<LoadInst>(U)) { // Note that atomic loads can be transformed; atomic semantics do @@ -94,7 +71,7 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) { return false; } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) { if (SI->getOperand(0) == AI) - return false; // Don't allow a store OF the AI, only INTO the AI. + return false; // Don't allow a store OF the AI, only INTO the AI. // Note that atomic stores can be transformed; atomic semantics do // not have any meaning for a local alloca. if (SI->isVolatile()) @@ -124,243 +101,217 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) { } namespace { - struct AllocaInfo; - - // Data package used by RenamePass() - class RenamePassData { - public: - typedef std::vector<Value *> ValVector; - - RenamePassData() : BB(NULL), Pred(NULL), Values() {} - RenamePassData(BasicBlock *B, BasicBlock *P, - const ValVector &V) : BB(B), Pred(P), Values(V) {} - BasicBlock *BB; - BasicBlock *Pred; - ValVector Values; - - void swap(RenamePassData &RHS) { - std::swap(BB, RHS.BB); - std::swap(Pred, RHS.Pred); - Values.swap(RHS.Values); + +struct AllocaInfo { + SmallVector<BasicBlock *, 32> DefiningBlocks; + SmallVector<BasicBlock *, 32> UsingBlocks; + + StoreInst *OnlyStore; + BasicBlock *OnlyBlock; + bool OnlyUsedInOneBlock; + + Value *AllocaPointerVal; + DbgDeclareInst *DbgDeclare; + + void clear() { + DefiningBlocks.clear(); + UsingBlocks.clear(); + OnlyStore = 0; + OnlyBlock = 0; + OnlyUsedInOneBlock = true; + AllocaPointerVal = 0; + DbgDeclare = 0; + } + + /// Scan the uses of the specified alloca, filling in the AllocaInfo used + /// by the rest of the pass to reason about the uses of this alloca. + void AnalyzeAlloca(AllocaInst *AI) { + clear(); + + // As we scan the uses of the alloca instruction, keep track of stores, + // and decide whether all of the loads and stores to the alloca are within + // the same basic block. + for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); + UI != E;) { + Instruction *User = cast<Instruction>(*UI++); + + if (StoreInst *SI = dyn_cast<StoreInst>(User)) { + // Remember the basic blocks which define new values for the alloca + DefiningBlocks.push_back(SI->getParent()); + AllocaPointerVal = SI->getOperand(0); + OnlyStore = SI; + } else { + LoadInst *LI = cast<LoadInst>(User); + // Otherwise it must be a load instruction, keep track of variable + // reads. + UsingBlocks.push_back(LI->getParent()); + AllocaPointerVal = LI; + } + + if (OnlyUsedInOneBlock) { + if (OnlyBlock == 0) + OnlyBlock = User->getParent(); + else if (OnlyBlock != User->getParent()) + OnlyUsedInOneBlock = false; + } } - }; - - /// LargeBlockInfo - This assigns and keeps a per-bb relative ordering of - /// load/store instructions in the block that directly load or store an alloca. 
+ + DbgDeclare = FindAllocaDbgDeclare(AI); + } +}; + +// Data package used by RenamePass() +class RenamePassData { +public: + typedef std::vector<Value *> ValVector; + + RenamePassData() : BB(NULL), Pred(NULL), Values() {} + RenamePassData(BasicBlock *B, BasicBlock *P, const ValVector &V) + : BB(B), Pred(P), Values(V) {} + BasicBlock *BB; + BasicBlock *Pred; + ValVector Values; + + void swap(RenamePassData &RHS) { + std::swap(BB, RHS.BB); + std::swap(Pred, RHS.Pred); + Values.swap(RHS.Values); + } +}; + +/// \brief This assigns and keeps a per-bb relative ordering of load/store +/// instructions in the block that directly load or store an alloca. +/// +/// This functionality is important because it avoids scanning large basic +/// blocks multiple times when promoting many allocas in the same block. +class LargeBlockInfo { + /// \brief For each instruction that we track, keep the index of the + /// instruction. /// - /// This functionality is important because it avoids scanning large basic - /// blocks multiple times when promoting many allocas in the same block. - class LargeBlockInfo { - /// InstNumbers - For each instruction that we track, keep the index of the - /// instruction. The index starts out as the number of the instruction from - /// the start of the block. - DenseMap<const Instruction *, unsigned> InstNumbers; - public: - - /// isInterestingInstruction - This code only looks at accesses to allocas. - static bool isInterestingInstruction(const Instruction *I) { - return (isa<LoadInst>(I) && isa<AllocaInst>(I->getOperand(0))) || - (isa<StoreInst>(I) && isa<AllocaInst>(I->getOperand(1))); - } - - /// getInstructionIndex - Get or calculate the index of the specified - /// instruction. - unsigned getInstructionIndex(const Instruction *I) { - assert(isInterestingInstruction(I) && - "Not a load/store to/from an alloca?"); - - // If we already have this instruction number, return it. - DenseMap<const Instruction *, unsigned>::iterator It = InstNumbers.find(I); - if (It != InstNumbers.end()) return It->second; - - // Scan the whole block to get the instruction. This accumulates - // information for every interesting instruction in the block, in order to - // avoid gratuitus rescans. - const BasicBlock *BB = I->getParent(); - unsigned InstNo = 0; - for (BasicBlock::const_iterator BBI = BB->begin(), E = BB->end(); - BBI != E; ++BBI) - if (isInterestingInstruction(BBI)) - InstNumbers[BBI] = InstNo++; - It = InstNumbers.find(I); - - assert(It != InstNumbers.end() && "Didn't insert instruction?"); + /// The index starts out as the number of the instruction from the start of + /// the block. + DenseMap<const Instruction *, unsigned> InstNumbers; + +public: + + /// This code only looks at accesses to allocas. + static bool isInterestingInstruction(const Instruction *I) { + return (isa<LoadInst>(I) && isa<AllocaInst>(I->getOperand(0))) || + (isa<StoreInst>(I) && isa<AllocaInst>(I->getOperand(1))); + } + + /// Get or calculate the index of the specified instruction. + unsigned getInstructionIndex(const Instruction *I) { + assert(isInterestingInstruction(I) && + "Not a load/store to/from an alloca?"); + + // If we already have this instruction number, return it. 
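
// ------------------------------------------------------------------ [sketch]
// A minimal standalone sketch (not from this patch) of the caching pattern
// behind LargeBlockInfo::getInstructionIndex: a lookup miss triggers one scan
// that numbers every instruction of interest in the block, so repeated
// queries against large blocks stay amortized O(1). Toy types only; 'Block'
// and 'BlockIndex' are illustrative names, not LLVM API.
#include <cassert>
#include <cstddef>
#include <unordered_map>
#include <vector>

struct Block { std::vector<const void *> Insts; };

class BlockIndex {
  std::unordered_map<const void *, unsigned> Numbers;

public:
  unsigned indexOf(const Block &B, const void *I) {
    std::unordered_map<const void *, unsigned>::iterator It = Numbers.find(I);
    if (It != Numbers.end())
      return It->second; // Cache hit: no rescan of the block.

    // Cache miss: number the whole block once, then retry the lookup.
    unsigned N = 0;
    for (std::size_t J = 0; J != B.Insts.size(); ++J)
      Numbers[B.Insts[J]] = N++;
    It = Numbers.find(I);
    assert(It != Numbers.end() && "instruction not in block?");
    return It->second;
  }
};
// ------------------------------------------------------------------ [/sketch]
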
+    DenseMap<const Instruction *, unsigned>::iterator It = InstNumbers.find(I);
+    if (It != InstNumbers.end()) return It->second;
-    }
-
-    void deleteValue(const Instruction *I) {
-      InstNumbers.erase(I);
-    }
-
-    void clear() {
-      InstNumbers.clear();
-    }
-  };
-
-  struct PromoteMem2Reg {
-    /// Allocas - The alloca instructions being promoted.
-    ///
-    std::vector<AllocaInst*> Allocas;
-    DominatorTree &DT;
-    DIBuilder *DIB;
-
-    /// AST - An AliasSetTracker object to update. If null, don't update it.
-    ///
-    AliasSetTracker *AST;
-
-    /// AllocaLookup - Reverse mapping of Allocas.
-    ///
-    DenseMap<AllocaInst*, unsigned> AllocaLookup;
-
-    /// NewPhiNodes - The PhiNodes we're adding. That map is used to simplify
-    /// some Phi nodes as we iterate over it, so it should have deterministic
-    /// iterators. We could use a MapVector, but since we already maintain a
-    /// map from BasicBlock* to a stable numbering (BBNumbers), the DenseMap is
-    /// more efficient (also supports removal).
-    ///
-    DenseMap<std::pair<unsigned, unsigned>, PHINode*> NewPhiNodes;
-
-    /// PhiToAllocaMap - For each PHI node, keep track of which entry in Allocas
-    /// it corresponds to.
-    DenseMap<PHINode*, unsigned> PhiToAllocaMap;
-
-    /// PointerAllocaValues - If we are updating an AliasSetTracker, then for
-    /// each alloca that is of pointer type, we keep track of what to copyValue
-    /// to the inserted PHI nodes here.
-    ///
-    std::vector<Value*> PointerAllocaValues;
-
-    /// AllocaDbgDeclares - For each alloca, we keep track of the dbg.declare
-    /// intrinsic that describes it, if any, so that we can convert it to a
-    /// dbg.value intrinsic if the alloca gets promoted.
-    SmallVector<DbgDeclareInst*, 8> AllocaDbgDeclares;
-
-    /// Visited - The set of basic blocks the renamer has already visited.
-    ///
-    SmallPtrSet<BasicBlock*, 16> Visited;
-
-    /// BBNumbers - Contains a stable numbering of basic blocks to avoid
-    /// non-determinstic behavior.
-    DenseMap<BasicBlock*, unsigned> BBNumbers;
-
-    /// DomLevels - Maps DomTreeNodes to their level in the dominator tree.
-    DenseMap<DomTreeNode*, unsigned> DomLevels;
-
-    /// BBNumPreds - Lazily compute the number of predecessors a block has.
-    DenseMap<const BasicBlock*, unsigned> BBNumPreds;
-  public:
-    PromoteMem2Reg(const std::vector<AllocaInst*> &A, DominatorTree &dt,
-                   AliasSetTracker *ast)
-      : Allocas(A), DT(dt), DIB(0), AST(ast) {}
-    ~PromoteMem2Reg() {
-      delete DIB;
-    }
-    void run();
+    // Scan the whole block to get the instruction. This accumulates
+    // information for every interesting instruction in the block, in order to
+    // avoid gratuitous rescans.
+    const BasicBlock *BB = I->getParent();
+    unsigned InstNo = 0;
+    for (BasicBlock::const_iterator BBI = BB->begin(), E = BB->end(); BBI != E;
+         ++BBI)
+      if (isInterestingInstruction(BBI))
+        InstNumbers[BBI] = InstNo++;
+    It = InstNumbers.find(I);
+
+    assert(It != InstNumbers.end() && "Didn't insert instruction?");
+    return It->second;
+  }
-    /// dominates - Return true if BB1 dominates BB2 using the DominatorTree.
- /// - bool dominates(BasicBlock *BB1, BasicBlock *BB2) const { - return DT.dominates(BB1, BB2); - } + void deleteValue(const Instruction *I) { InstNumbers.erase(I); } - private: - void RemoveFromAllocasList(unsigned &AllocaIdx) { - Allocas[AllocaIdx] = Allocas.back(); - Allocas.pop_back(); - --AllocaIdx; - } + void clear() { InstNumbers.clear(); } +}; - unsigned getNumPreds(const BasicBlock *BB) { - unsigned &NP = BBNumPreds[BB]; - if (NP == 0) - NP = std::distance(pred_begin(BB), pred_end(BB))+1; - return NP-1; - } +struct PromoteMem2Reg { + /// The alloca instructions being promoted. + std::vector<AllocaInst *> Allocas; + DominatorTree &DT; + DIBuilder DIB; - void DetermineInsertionPoint(AllocaInst *AI, unsigned AllocaNum, - AllocaInfo &Info); - void ComputeLiveInBlocks(AllocaInst *AI, AllocaInfo &Info, - const SmallPtrSet<BasicBlock*, 32> &DefBlocks, - SmallPtrSet<BasicBlock*, 32> &LiveInBlocks); - - void RewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, - LargeBlockInfo &LBI); - void PromoteSingleBlockAlloca(AllocaInst *AI, AllocaInfo &Info, - LargeBlockInfo &LBI); - - void RenamePass(BasicBlock *BB, BasicBlock *Pred, - RenamePassData::ValVector &IncVals, - std::vector<RenamePassData> &Worklist); - bool QueuePhiNode(BasicBlock *BB, unsigned AllocaIdx, unsigned &Version); - }; - - struct AllocaInfo { - SmallVector<BasicBlock*, 32> DefiningBlocks; - SmallVector<BasicBlock*, 32> UsingBlocks; - - StoreInst *OnlyStore; - BasicBlock *OnlyBlock; - bool OnlyUsedInOneBlock; - - Value *AllocaPointerVal; - DbgDeclareInst *DbgDeclare; - - void clear() { - DefiningBlocks.clear(); - UsingBlocks.clear(); - OnlyStore = 0; - OnlyBlock = 0; - OnlyUsedInOneBlock = true; - AllocaPointerVal = 0; - DbgDeclare = 0; - } - - /// AnalyzeAlloca - Scan the uses of the specified alloca, filling in our - /// ivars. - void AnalyzeAlloca(AllocaInst *AI) { - clear(); - - // As we scan the uses of the alloca instruction, keep track of stores, - // and decide whether all of the loads and stores to the alloca are within - // the same basic block. - for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); - UI != E;) { - Instruction *User = cast<Instruction>(*UI++); - - if (StoreInst *SI = dyn_cast<StoreInst>(User)) { - // Remember the basic blocks which define new values for the alloca - DefiningBlocks.push_back(SI->getParent()); - AllocaPointerVal = SI->getOperand(0); - OnlyStore = SI; - } else { - LoadInst *LI = cast<LoadInst>(User); - // Otherwise it must be a load instruction, keep track of variable - // reads. - UsingBlocks.push_back(LI->getParent()); - AllocaPointerVal = LI; - } - - if (OnlyUsedInOneBlock) { - if (OnlyBlock == 0) - OnlyBlock = User->getParent(); - else if (OnlyBlock != User->getParent()) - OnlyUsedInOneBlock = false; - } - } - - DbgDeclare = FindAllocaDbgDeclare(AI); - } - }; + /// An AliasSetTracker object to update. If null, don't update it. + AliasSetTracker *AST; - typedef std::pair<DomTreeNode*, unsigned> DomTreeNodePair; + /// Reverse mapping of Allocas. + DenseMap<AllocaInst *, unsigned> AllocaLookup; - struct DomTreeNodeCompare { - bool operator()(const DomTreeNodePair &LHS, const DomTreeNodePair &RHS) { - return LHS.second < RHS.second; - } - }; -} // end of anonymous namespace + /// \brief The PhiNodes we're adding. + /// + /// That map is used to simplify some Phi nodes as we iterate over it, so + /// it should have deterministic iterators. 
We could use a MapVector, but
+  /// since we already maintain a map from BasicBlock* to a stable numbering
+  /// (BBNumbers), the DenseMap is more efficient (also supports removal).
+  DenseMap<std::pair<unsigned, unsigned>, PHINode *> NewPhiNodes;
+
+  /// For each PHI node, keep track of which entry in Allocas it corresponds
+  /// to.
+  DenseMap<PHINode *, unsigned> PhiToAllocaMap;
+
+  /// If we are updating an AliasSetTracker, then for each alloca that is of
+  /// pointer type, we keep track of what to copyValue to the inserted PHI
+  /// nodes here.
+  std::vector<Value *> PointerAllocaValues;
+
+  /// For each alloca, we keep track of the dbg.declare intrinsic that
+  /// describes it, if any, so that we can convert it to a dbg.value
+  /// intrinsic if the alloca gets promoted.
+  SmallVector<DbgDeclareInst *, 8> AllocaDbgDeclares;
+
+  /// The set of basic blocks the renamer has already visited.
+  ///
+  SmallPtrSet<BasicBlock *, 16> Visited;
+
+  /// Contains a stable numbering of basic blocks to avoid non-deterministic
+  /// behavior.
+  DenseMap<BasicBlock *, unsigned> BBNumbers;
+
+  /// Maps DomTreeNodes to their level in the dominator tree.
+  DenseMap<DomTreeNode *, unsigned> DomLevels;
+
+  /// Lazily compute the number of predecessors a block has.
+  DenseMap<const BasicBlock *, unsigned> BBNumPreds;
+
+public:
+  PromoteMem2Reg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
+                 AliasSetTracker *AST)
+      : Allocas(Allocas.begin(), Allocas.end()), DT(DT),
+        DIB(*DT.getRoot()->getParent()->getParent()), AST(AST) {}
+
+  void run();
+
+private:
+  void RemoveFromAllocasList(unsigned &AllocaIdx) {
+    Allocas[AllocaIdx] = Allocas.back();
+    Allocas.pop_back();
+    --AllocaIdx;
+  }
+
+  unsigned getNumPreds(const BasicBlock *BB) {
+    unsigned &NP = BBNumPreds[BB];
+    if (NP == 0)
+      NP = std::distance(pred_begin(BB), pred_end(BB)) + 1;
+    return NP - 1;
+  }
+
+  void DetermineInsertionPoint(AllocaInst *AI, unsigned AllocaNum,
+                               AllocaInfo &Info);
+  void ComputeLiveInBlocks(AllocaInst *AI, AllocaInfo &Info,
+                           const SmallPtrSet<BasicBlock *, 32> &DefBlocks,
+                           SmallPtrSet<BasicBlock *, 32> &LiveInBlocks);
+  void RenamePass(BasicBlock *BB, BasicBlock *Pred,
+                  RenamePassData::ValVector &IncVals,
+                  std::vector<RenamePassData> &Worklist);
+  bool QueuePhiNode(BasicBlock *BB, unsigned AllocaIdx, unsigned &Version);
+};
+
+} // end of anonymous namespace
 static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
   // Knowing that this alloca is promotable, we know that it's safe to kill all
   // instructions except for load and store.
@@ -388,10 +339,191 @@ static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
   }
 }
+/// \brief Rewrite as many loads as possible given a single store.
+///
+/// When there is only a single store, we can use the domtree to trivially
+/// replace all of the dominated loads with the stored value. Do so, and return
+/// true if this has successfully promoted the alloca entirely. If this returns
+/// false there were some loads which were not dominated by the single store
+/// and thus must be phi-ed with undef. We fall back to the standard alloca
+/// promotion algorithm in that case.
+static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
+                                     LargeBlockInfo &LBI,
+                                     DominatorTree &DT,
+                                     AliasSetTracker *AST) {
+  StoreInst *OnlyStore = Info.OnlyStore;
+  bool StoringGlobalVal = !isa<Instruction>(OnlyStore->getOperand(0));
+  BasicBlock *StoreBB = OnlyStore->getParent();
+  int StoreIndex = -1;
+
+  // Clear out UsingBlocks. We will reconstruct it here if needed.
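
// ------------------------------------------------------------------ [sketch]
// A condensed sketch (not from this patch) of the dominance rule that
// rewriteSingleStoreAlloca applies below: with exactly one store, a load may
// take the stored value only if the store comes first. Within the store's
// own block that reduces to comparing LargeBlockInfo indices; across blocks
// it is a dominator-tree query. All names here are illustrative.
static bool loadIsRewritable(unsigned StoreIdx, unsigned LoadIdx,
                             bool SameBlock, bool StoreBlockDominatesLoad) {
  if (SameBlock)
    return StoreIdx < LoadIdx;      // the store must precede the load
  return StoreBlockDominatesLoad;   // otherwise rely on BB-level dominance
}
// ------------------------------------------------------------------ [/sketch]
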
+  Info.UsingBlocks.clear();
+
+  for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E;) {
+    Instruction *UserInst = cast<Instruction>(*UI++);
+    if (!isa<LoadInst>(UserInst)) {
+      assert(UserInst == OnlyStore && "Should only have load/stores");
+      continue;
+    }
+    LoadInst *LI = cast<LoadInst>(UserInst);
+
+    // Okay, if we have a load from the alloca, we want to replace it with the
+    // only value stored to the alloca. We can do this if the value is
+    // dominated by the store. If not, we use the rest of the mem2reg machinery
+    // to insert the phi nodes as needed.
+    if (!StoringGlobalVal) { // Non-instructions are always dominated.
+      if (LI->getParent() == StoreBB) {
+        // If we have a use that is in the same block as the store, compare the
+        // indices of the two instructions to see which one came first. If the
+        // load came before the store, we can't handle it.
+        if (StoreIndex == -1)
+          StoreIndex = LBI.getInstructionIndex(OnlyStore);
+
+        if (unsigned(StoreIndex) > LBI.getInstructionIndex(LI)) {
+          // Can't handle this load, bail out.
+          Info.UsingBlocks.push_back(StoreBB);
+          continue;
+        }
+
+      } else if (LI->getParent() != StoreBB &&
+                 !DT.dominates(StoreBB, LI->getParent())) {
+        // If the load and store are in different blocks, use BB dominance to
+        // check their relationships. If the store doesn't dom the use, bail
+        // out.
+        Info.UsingBlocks.push_back(LI->getParent());
+        continue;
+      }
+    }
+
+    // Otherwise, we *can* safely rewrite this load.
+    Value *ReplVal = OnlyStore->getOperand(0);
+    // If the replacement value is the load, this must occur in unreachable
+    // code.
+    if (ReplVal == LI)
+      ReplVal = UndefValue::get(LI->getType());
+    LI->replaceAllUsesWith(ReplVal);
+    if (AST && LI->getType()->isPointerTy())
+      AST->deleteValue(LI);
+    LI->eraseFromParent();
+    LBI.deleteValue(LI);
+  }
+
+  // Finally, after the scan, check to see if the store is all that is left.
+  if (!Info.UsingBlocks.empty())
+    return false; // If not, we'll have to fall back for the remainder.
+
+  // Record debuginfo for the store and remove the declaration's
+  // debuginfo.
+  if (DbgDeclareInst *DDI = Info.DbgDeclare) {
+    DIBuilder DIB(*AI->getParent()->getParent()->getParent());
+    ConvertDebugDeclareToDebugValue(DDI, Info.OnlyStore, DIB);
+    DDI->eraseFromParent();
+    LBI.deleteValue(DDI);
+  }
+  // Remove the (now dead) store and alloca.
+  Info.OnlyStore->eraseFromParent();
+  LBI.deleteValue(Info.OnlyStore);
+
+  if (AST)
+    AST->deleteValue(AI);
+  AI->eraseFromParent();
+  LBI.deleteValue(AI);
+  return true;
+}
+
+/// Many allocas are only used within a single basic block. If this is the
+/// case, avoid traversing the CFG and inserting a lot of potentially useless
+/// PHI nodes by just performing a single linear pass over the basic block
+/// using the Alloca.
+///
+/// Promotion here always succeeds, even if the alloca may be read before it
+/// is written: such loads simply take the undef value. This is necessary in
+/// cases where, due to control flow, the alloca is potentially undefined on
+/// some control flow paths. e.g. code like this is potentially correct:
+///
+/// for (...) { if (c) { A = undef; undef = B; } }
+///
+/// ... so long as A is not used before undef is set.
+static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
+                                     LargeBlockInfo &LBI,
+                                     AliasSetTracker *AST) {
+  // The trickiest case to handle is when we have large blocks. Because of this,
+  // this code is optimized assuming that large blocks happen. This does not
+  // significantly pessimize the small block case.
This uses LargeBlockInfo to + // make it efficient to get the index of various operations in the block. + + // Walk the use-def list of the alloca, getting the locations of all stores. + typedef SmallVector<std::pair<unsigned, StoreInst *>, 64> StoresByIndexTy; + StoresByIndexTy StoresByIndex; + + for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E; + ++UI) + if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) + StoresByIndex.push_back(std::make_pair(LBI.getInstructionIndex(SI), SI)); + + // Sort the stores by their index, making it efficient to do a lookup with a + // binary search. + std::sort(StoresByIndex.begin(), StoresByIndex.end(), less_first()); + + // Walk all of the loads from this alloca, replacing them with the nearest + // store above them, if any. + for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E;) { + LoadInst *LI = dyn_cast<LoadInst>(*UI++); + if (!LI) + continue; + + unsigned LoadIdx = LBI.getInstructionIndex(LI); + + // Find the nearest store that has a lower index than this load. + StoresByIndexTy::iterator I = + std::lower_bound(StoresByIndex.begin(), StoresByIndex.end(), + std::make_pair(LoadIdx, static_cast<StoreInst *>(0)), + less_first()); + + if (I == StoresByIndex.begin()) + // If there is no store before this load, the load takes the undef value. + LI->replaceAllUsesWith(UndefValue::get(LI->getType())); + else + // Otherwise, there was a store before this load, the load takes its value. + LI->replaceAllUsesWith(llvm::prior(I)->second->getOperand(0)); + + if (AST && LI->getType()->isPointerTy()) + AST->deleteValue(LI); + LI->eraseFromParent(); + LBI.deleteValue(LI); + } + + // Remove the (now dead) stores and alloca. + while (!AI->use_empty()) { + StoreInst *SI = cast<StoreInst>(AI->use_back()); + // Record debuginfo for the store before removing it. + if (DbgDeclareInst *DDI = Info.DbgDeclare) { + DIBuilder DIB(*AI->getParent()->getParent()->getParent()); + ConvertDebugDeclareToDebugValue(DDI, SI, DIB); + } + SI->eraseFromParent(); + LBI.deleteValue(SI); + } + + if (AST) + AST->deleteValue(AI); + AI->eraseFromParent(); + LBI.deleteValue(AI); + + // The alloca's debuginfo can be removed as well. + if (DbgDeclareInst *DDI = Info.DbgDeclare) { + DDI->eraseFromParent(); + LBI.deleteValue(DDI); + } + + ++NumLocalPromoted; +} + void PromoteMem2Reg::run() { Function &F = *DT.getRoot()->getParent(); - if (AST) PointerAllocaValues.resize(Allocas.size()); + if (AST) + PointerAllocaValues.resize(Allocas.size()); AllocaDbgDeclares.resize(Allocas.size()); AllocaInfo Info; @@ -400,8 +532,7 @@ void PromoteMem2Reg::run() { for (unsigned AllocaNum = 0; AllocaNum != Allocas.size(); ++AllocaNum) { AllocaInst *AI = Allocas[AllocaNum]; - assert(isAllocaPromotable(AI) && - "Cannot promote non-promotable alloca!"); + assert(isAllocaPromotable(AI) && "Cannot promote non-promotable alloca!"); assert(AI->getParent()->getParent() == &F && "All allocas should be in the same function, which is same as DF!"); @@ -409,7 +540,8 @@ void PromoteMem2Reg::run() { if (AI->use_empty()) { // If there are no uses of the alloca, just delete it now. - if (AST) AST->deleteValue(AI); + if (AST) + AST->deleteValue(AI); AI->eraseFromParent(); // Remove the alloca from the Allocas list, since it has been processed @@ -417,7 +549,7 @@ void PromoteMem2Reg::run() { ++NumDeadAlloca; continue; } - + // Calculate the set of read and write-locations for each alloca. This is // analogous to finding the 'uses' and 'definitions' of each variable. 
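
// ------------------------------------------------------------------ [sketch]
// A standalone sketch (not from this patch) of the lookup scheme used by
// promoteSingleBlockAlloca above: pair each store with its block index, sort
// (the patch uses less_first(); plain pair ordering behaves the same here),
// then binary-search for the nearest store preceding each load. The patch
// uses llvm::prior(I) where this sketch uses (I - 1).
#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // (instruction index, stored value) pairs, as StoresByIndex would hold.
  std::vector<std::pair<unsigned, int> > Stores;
  Stores.push_back(std::make_pair(7u, 20));
  Stores.push_back(std::make_pair(2u, 10));
  std::sort(Stores.begin(), Stores.end());

  unsigned LoadIdx = 5;
  // First store with index >= LoadIdx; the store just before it (if any)
  // supplies the loaded value.
  std::vector<std::pair<unsigned, int> >::iterator I =
      std::lower_bound(Stores.begin(), Stores.end(),
                       std::make_pair(LoadIdx, 0));
  if (I == Stores.begin())
    std::printf("load at %u sees undef\n", LoadIdx);
  else
    std::printf("load at %u sees %d\n", LoadIdx, (I - 1)->second); // 10
  return 0;
}
// ------------------------------------------------------------------ [/sketch]
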
Info.AnalyzeAlloca(AI); @@ -425,75 +557,27 @@ void PromoteMem2Reg::run() { // If there is only a single store to this value, replace any loads of // it that are directly dominated by the definition with the value stored. if (Info.DefiningBlocks.size() == 1) { - RewriteSingleStoreAlloca(AI, Info, LBI); - - // Finally, after the scan, check to see if the store is all that is left. - if (Info.UsingBlocks.empty()) { - // Record debuginfo for the store and remove the declaration's - // debuginfo. - if (DbgDeclareInst *DDI = Info.DbgDeclare) { - if (!DIB) - DIB = new DIBuilder(*DDI->getParent()->getParent()->getParent()); - ConvertDebugDeclareToDebugValue(DDI, Info.OnlyStore, *DIB); - DDI->eraseFromParent(); - } - // Remove the (now dead) store and alloca. - Info.OnlyStore->eraseFromParent(); - LBI.deleteValue(Info.OnlyStore); - - if (AST) AST->deleteValue(AI); - AI->eraseFromParent(); - LBI.deleteValue(AI); - + if (rewriteSingleStoreAlloca(AI, Info, LBI, DT, AST)) { // The alloca has been processed, move on. RemoveFromAllocasList(AllocaNum); - ++NumSingleStore; continue; } } - + // If the alloca is only read and written in one basic block, just perform a // linear sweep over the block to eliminate it. if (Info.OnlyUsedInOneBlock) { - PromoteSingleBlockAlloca(AI, Info, LBI); - - // Finally, after the scan, check to see if the stores are all that is - // left. - if (Info.UsingBlocks.empty()) { - - // Remove the (now dead) stores and alloca. - while (!AI->use_empty()) { - StoreInst *SI = cast<StoreInst>(AI->use_back()); - // Record debuginfo for the store before removing it. - if (DbgDeclareInst *DDI = Info.DbgDeclare) { - if (!DIB) - DIB = new DIBuilder(*SI->getParent()->getParent()->getParent()); - ConvertDebugDeclareToDebugValue(DDI, SI, *DIB); - } - SI->eraseFromParent(); - LBI.deleteValue(SI); - } - - if (AST) AST->deleteValue(AI); - AI->eraseFromParent(); - LBI.deleteValue(AI); - - // The alloca has been processed, move on. - RemoveFromAllocasList(AllocaNum); - - // The alloca's debuginfo can be removed as well. - if (DbgDeclareInst *DDI = Info.DbgDeclare) - DDI->eraseFromParent(); + promoteSingleBlockAlloca(AI, Info, LBI, AST); - ++NumLocalPromoted; - continue; - } + // The alloca has been processed, move on. + RemoveFromAllocasList(AllocaNum); + continue; } // If we haven't computed dominator tree levels, do so now. if (DomLevels.empty()) { - SmallVector<DomTreeNode*, 32> Worklist; + SmallVector<DomTreeNode *, 32> Worklist; DomTreeNode *Root = DT.getRootNode(); DomLevels[Root] = 0; @@ -522,10 +606,11 @@ void PromoteMem2Reg::run() { // stored into the alloca. if (AST) PointerAllocaValues[AllocaNum] = Info.AllocaPointerVal; - + // Remember the dbg.declare intrinsic describing this alloca, if any. - if (Info.DbgDeclare) AllocaDbgDeclares[AllocaNum] = Info.DbgDeclare; - + if (Info.DbgDeclare) + AllocaDbgDeclares[AllocaNum] = Info.DbgDeclare; + // Keep the reverse mapping of the 'Allocas' array for the rename pass. AllocaLookup[Allocas[AllocaNum]] = AllocaNum; @@ -540,8 +625,7 @@ void PromoteMem2Reg::run() { return; // All of the allocas must have been trivial! LBI.clear(); - - + // Set the incoming values for the basic block to be null values for all of // the alloca's. We do this in case there is a load of a value that has not // been stored yet. In this case, it will get this null value. @@ -562,7 +646,7 @@ void PromoteMem2Reg::run() { // RenamePass may add new worklist entries. 
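
// ------------------------------------------------------------------ [sketch]
// An aside (not from this patch) on the do/while driver here: RenamePass
// deliberately does not recurse into successors; it pushes them onto the
// shared worklist, bounding stack depth on huge CFGs. The control pattern in
// miniature, with toy types:
#include <cstdio>
#include <vector>

struct Task { int Id; };

static void process(const Task &T, std::vector<Task> &Worklist) {
  std::printf("visiting %d\n", T.Id);
  if (T.Id < 3) {             // discovered successors get queued,
    Task Next = { T.Id + 1 }; // never recursed into
    Worklist.push_back(Next);
  }
}

int main() {
  std::vector<Task> Worklist;
  Task Root = { 0 };
  Worklist.push_back(Root);
  do {
    Task T = Worklist.back();
    Worklist.pop_back();
    process(T, Worklist); // may add new worklist entries
  } while (!Worklist.empty());
  return 0;
}
// ------------------------------------------------------------------ [/sketch]
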
RenamePass(RPD.BB, RPD.Pred, RPD.Values, RenamePassWorkList); } while (!RenamePassWorkList.empty()); - + // The renamer uses the Visited set to avoid infinite loops. Clear it now. Visited.clear(); @@ -575,7 +659,8 @@ void PromoteMem2Reg::run() { // tree. Just delete the users now. if (!A->use_empty()) A->replaceAllUsesWith(UndefValue::get(A->getType())); - if (AST) AST->deleteValue(A); + if (AST) + AST->deleteValue(A); A->eraseFromParent(); } @@ -591,13 +676,15 @@ void PromoteMem2Reg::run() { bool EliminatedAPHI = true; while (EliminatedAPHI) { EliminatedAPHI = false; - + // Iterating over NewPhiNodes is deterministic, so it is safe to try to // simplify and RAUW them as we go. If it was not, we could add uses to // the values we replace with in a non deterministic order, thus creating // non deterministic def->use chains. - for (DenseMap<std::pair<unsigned, unsigned>, PHINode*>::iterator I = - NewPhiNodes.begin(), E = NewPhiNodes.end(); I != E;) { + for (DenseMap<std::pair<unsigned, unsigned>, PHINode *>::iterator + I = NewPhiNodes.begin(), + E = NewPhiNodes.end(); + I != E;) { PHINode *PN = I->second; // If this PHI node merges one value and/or undefs, get the value. @@ -613,15 +700,17 @@ void PromoteMem2Reg::run() { ++I; } } - + // At this point, the renamer has added entries to PHI nodes for all reachable // code. Unfortunately, there may be unreachable blocks which the renamer // hasn't traversed. If this is the case, the PHI nodes may not // have incoming values for all predecessors. Loop over all PHI nodes we have // created, inserting undef values if they are missing any incoming values. // - for (DenseMap<std::pair<unsigned, unsigned>, PHINode*>::iterator I = - NewPhiNodes.begin(), E = NewPhiNodes.end(); I != E; ++I) { + for (DenseMap<std::pair<unsigned, unsigned>, PHINode *>::iterator + I = NewPhiNodes.begin(), + E = NewPhiNodes.end(); + I != E; ++I) { // We want to do this once per basic block. As such, only process a block // when we find the PHI that is the first entry in the block. PHINode *SomePHI = I->second; @@ -636,21 +725,20 @@ void PromoteMem2Reg::run() { continue; // Get the preds for BB. - SmallVector<BasicBlock*, 16> Preds(pred_begin(BB), pred_end(BB)); - + SmallVector<BasicBlock *, 16> Preds(pred_begin(BB), pred_end(BB)); + // Ok, now we know that all of the PHI nodes are missing entries for some // basic blocks. Start by sorting the incoming predecessors for efficient // access. std::sort(Preds.begin(), Preds.end()); - + // Now we loop through all BB's which have entries in SomePHI and remove // them from the Preds list. for (unsigned i = 0, e = SomePHI->getNumIncomingValues(); i != e; ++i) { // Do a log(n) search of the Preds list for the entry we want. - SmallVector<BasicBlock*, 16>::iterator EntIt = - std::lower_bound(Preds.begin(), Preds.end(), - SomePHI->getIncomingBlock(i)); - assert(EntIt != Preds.end() && *EntIt == SomePHI->getIncomingBlock(i)&& + SmallVectorImpl<BasicBlock *>::iterator EntIt = std::lower_bound( + Preds.begin(), Preds.end(), SomePHI->getIncomingBlock(i)); + assert(EntIt != Preds.end() && *EntIt == SomePHI->getIncomingBlock(i) && "PHI node has entry for a block which is not a predecessor!"); // Remove the entry @@ -670,39 +758,41 @@ void PromoteMem2Reg::run() { SomePHI->addIncoming(UndefVal, Preds[pred]); } } - + NewPhiNodes.clear(); } +/// \brief Determine which blocks the value is live in. +/// +/// These are blocks which lead to uses. 
Knowing this allows us to avoid
+/// inserting PHI nodes into blocks which don't lead to uses (thus, the
+/// inserted phi nodes would be dead).
+void PromoteMem2Reg::ComputeLiveInBlocks(
+    AllocaInst *AI, AllocaInfo &Info,
+    const SmallPtrSet<BasicBlock *, 32> &DefBlocks,
+    SmallPtrSet<BasicBlock *, 32> &LiveInBlocks) {
-/// ComputeLiveInBlocks - Determine which blocks the value is live in. These
-/// are blocks which lead to uses. Knowing this allows us to avoid inserting
-/// PHI nodes into blocks which don't lead to uses (thus, the inserted phi nodes
-/// would be dead).
-void PromoteMem2Reg::
-ComputeLiveInBlocks(AllocaInst *AI, AllocaInfo &Info,
-                    const SmallPtrSet<BasicBlock*, 32> &DefBlocks,
-                    SmallPtrSet<BasicBlock*, 32> &LiveInBlocks) {
-
   // To determine liveness, we must iterate through the predecessors of blocks
   // where the def is live. Blocks are added to the worklist if we need to
   // check their predecessors. Start with all the using blocks.
-  SmallVector<BasicBlock*, 64> LiveInBlockWorklist(Info.UsingBlocks.begin(),
-                                                   Info.UsingBlocks.end());
-
+  SmallVector<BasicBlock *, 64> LiveInBlockWorklist(Info.UsingBlocks.begin(),
+                                                    Info.UsingBlocks.end());
+
   // If any of the using blocks is also a definition block, check to see if the
   // definition occurs before or after the use. If it happens before the use,
   // the value isn't really live-in.
   for (unsigned i = 0, e = LiveInBlockWorklist.size(); i != e; ++i) {
     BasicBlock *BB = LiveInBlockWorklist[i];
-    if (!DefBlocks.count(BB)) continue;
-
+    if (!DefBlocks.count(BB))
+      continue;
+
     // Okay, this is a block that both uses and defines the value. If the first
     // reference to the alloca is a def (store), then we know it isn't live-in.
-    for (BasicBlock::iterator I = BB->begin(); ; ++I) {
+    for (BasicBlock::iterator I = BB->begin();; ++I) {
       if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
-        if (SI->getOperand(1) != AI) continue;
-
+        if (SI->getOperand(1) != AI)
+          continue;
+
        // We found a store to the alloca before a load. The alloca is not
        // actually live-in here.
        LiveInBlockWorklist[i] = LiveInBlockWorklist.back();
@@ -710,73 +800,76 @@ ComputeLiveInBlocks(AllocaInst *AI, AllocaInfo &Info,
         --i, --e;
         break;
       }
-
+
       if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
-        if (LI->getOperand(0) != AI) continue;
-
+        if (LI->getOperand(0) != AI)
+          continue;
+
         // Okay, we found a load before a store to the alloca. It is actually
         // live into this block.
         break;
       }
     }
   }
-
+
   // Now that we have a set of blocks where the phi is live-in, recursively add
   // their predecessors until we find the full region the value is live.
   while (!LiveInBlockWorklist.empty()) {
     BasicBlock *BB = LiveInBlockWorklist.pop_back_val();
-
+
     // The block really is live in here, insert it into the set. If already in
     // the set, then it has already been processed.
     if (!LiveInBlocks.insert(BB))
       continue;
-
+
     // Since the value is live into BB, it is either defined in a predecessor
     // or live into it too. Add the preds to the worklist unless they are a
     // defining block.
     for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
       BasicBlock *P = *PI;
-
+
       // The value is not live into a predecessor if it defines the value.
       if (DefBlocks.count(P))
         continue;
-
+
       // Otherwise it is, add to the worklist.
       LiveInBlockWorklist.push_back(P);
     }
   }
 }

-/// DetermineInsertionPoint - At this point, we're committed to promoting the
-/// alloca using IDF's, and the standard SSA construction algorithm. Determine
Determine -/// which blocks need phi nodes and see if we can optimize out some work by -/// avoiding insertion of dead phi nodes. +/// At this point, we're committed to promoting the alloca using IDF's, and the +/// standard SSA construction algorithm. Determine which blocks need phi nodes +/// and see if we can optimize out some work by avoiding insertion of dead phi +/// nodes. void PromoteMem2Reg::DetermineInsertionPoint(AllocaInst *AI, unsigned AllocaNum, AllocaInfo &Info) { // Unique the set of defining blocks for efficient lookup. - SmallPtrSet<BasicBlock*, 32> DefBlocks; + SmallPtrSet<BasicBlock *, 32> DefBlocks; DefBlocks.insert(Info.DefiningBlocks.begin(), Info.DefiningBlocks.end()); // Determine which blocks the value is live in. These are blocks which lead // to uses. - SmallPtrSet<BasicBlock*, 32> LiveInBlocks; + SmallPtrSet<BasicBlock *, 32> LiveInBlocks; ComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks); // Use a priority queue keyed on dominator tree level so that inserted nodes // are handled from the bottom of the dominator tree upwards. + typedef std::pair<DomTreeNode *, unsigned> DomTreeNodePair; typedef std::priority_queue<DomTreeNodePair, SmallVector<DomTreeNodePair, 32>, - DomTreeNodeCompare> IDFPriorityQueue; + less_second> IDFPriorityQueue; IDFPriorityQueue PQ; - for (SmallPtrSet<BasicBlock*, 32>::const_iterator I = DefBlocks.begin(), - E = DefBlocks.end(); I != E; ++I) { + for (SmallPtrSet<BasicBlock *, 32>::const_iterator I = DefBlocks.begin(), + E = DefBlocks.end(); + I != E; ++I) { if (DomTreeNode *Node = DT.getNode(*I)) PQ.push(std::make_pair(Node, DomLevels[Node])); } - SmallVector<std::pair<unsigned, BasicBlock*>, 32> DFBlocks; - SmallPtrSet<DomTreeNode*, 32> Visited; - SmallVector<DomTreeNode*, 32> Worklist; + SmallVector<std::pair<unsigned, BasicBlock *>, 32> DFBlocks; + SmallPtrSet<DomTreeNode *, 32> Visited; + SmallVector<DomTreeNode *, 32> Worklist; while (!PQ.empty()) { DomTreeNodePair RootPair = PQ.top(); PQ.pop(); @@ -836,179 +929,22 @@ void PromoteMem2Reg::DetermineInsertionPoint(AllocaInst *AI, unsigned AllocaNum, QueuePhiNode(DFBlocks[i].second, AllocaNum, CurrentVersion); } -/// RewriteSingleStoreAlloca - If there is only a single store to this value, -/// replace any loads of it that are directly dominated by the definition with -/// the value stored. -void PromoteMem2Reg::RewriteSingleStoreAlloca(AllocaInst *AI, - AllocaInfo &Info, - LargeBlockInfo &LBI) { - StoreInst *OnlyStore = Info.OnlyStore; - bool StoringGlobalVal = !isa<Instruction>(OnlyStore->getOperand(0)); - BasicBlock *StoreBB = OnlyStore->getParent(); - int StoreIndex = -1; - - // Clear out UsingBlocks. We will reconstruct it here if needed. - Info.UsingBlocks.clear(); - - for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E; ) { - Instruction *UserInst = cast<Instruction>(*UI++); - if (!isa<LoadInst>(UserInst)) { - assert(UserInst == OnlyStore && "Should only have load/stores"); - continue; - } - LoadInst *LI = cast<LoadInst>(UserInst); - - // Okay, if we have a load from the alloca, we want to replace it with the - // only value stored to the alloca. We can do this if the value is - // dominated by the store. If not, we use the rest of the mem2reg machinery - // to insert the phi nodes as needed. - if (!StoringGlobalVal) { // Non-instructions are always dominated. - if (LI->getParent() == StoreBB) { - // If we have a use that is in the same block as the store, compare the - // indices of the two instructions to see which one came first. 
If the - // load came before the store, we can't handle it. - if (StoreIndex == -1) - StoreIndex = LBI.getInstructionIndex(OnlyStore); - - if (unsigned(StoreIndex) > LBI.getInstructionIndex(LI)) { - // Can't handle this load, bail out. - Info.UsingBlocks.push_back(StoreBB); - continue; - } - - } else if (LI->getParent() != StoreBB && - !dominates(StoreBB, LI->getParent())) { - // If the load and store are in different blocks, use BB dominance to - // check their relationships. If the store doesn't dom the use, bail - // out. - Info.UsingBlocks.push_back(LI->getParent()); - continue; - } - } - - // Otherwise, we *can* safely rewrite this load. - Value *ReplVal = OnlyStore->getOperand(0); - // If the replacement value is the load, this must occur in unreachable - // code. - if (ReplVal == LI) - ReplVal = UndefValue::get(LI->getType()); - LI->replaceAllUsesWith(ReplVal); - if (AST && LI->getType()->isPointerTy()) - AST->deleteValue(LI); - LI->eraseFromParent(); - LBI.deleteValue(LI); - } -} - -namespace { - -/// StoreIndexSearchPredicate - This is a helper predicate used to search by the -/// first element of a pair. -struct StoreIndexSearchPredicate { - bool operator()(const std::pair<unsigned, StoreInst*> &LHS, - const std::pair<unsigned, StoreInst*> &RHS) { - return LHS.first < RHS.first; - } -}; - -} - -/// PromoteSingleBlockAlloca - Many allocas are only used within a single basic -/// block. If this is the case, avoid traversing the CFG and inserting a lot of -/// potentially useless PHI nodes by just performing a single linear pass over -/// the basic block using the Alloca. -/// -/// If we cannot promote this alloca (because it is read before it is written), -/// return true. This is necessary in cases where, due to control flow, the -/// alloca is potentially undefined on some control flow paths. e.g. code like -/// this is potentially correct: -/// -/// for (...) { if (c) { A = undef; undef = B; } } -/// -/// ... so long as A is not used before undef is set. +/// \brief Queue a phi-node to be added to a basic-block for a specific Alloca. /// -void PromoteMem2Reg::PromoteSingleBlockAlloca(AllocaInst *AI, AllocaInfo &Info, - LargeBlockInfo &LBI) { - // The trickiest case to handle is when we have large blocks. Because of this, - // this code is optimized assuming that large blocks happen. This does not - // significantly pessimize the small block case. This uses LargeBlockInfo to - // make it efficient to get the index of various operations in the block. - - // Clear out UsingBlocks. We will reconstruct it here if needed. - Info.UsingBlocks.clear(); - - // Walk the use-def list of the alloca, getting the locations of all stores. - typedef SmallVector<std::pair<unsigned, StoreInst*>, 64> StoresByIndexTy; - StoresByIndexTy StoresByIndex; - - for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); - UI != E; ++UI) - if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) - StoresByIndex.push_back(std::make_pair(LBI.getInstructionIndex(SI), SI)); - - // If there are no stores to the alloca, just replace any loads with undef. - if (StoresByIndex.empty()) { - for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E;) - if (LoadInst *LI = dyn_cast<LoadInst>(*UI++)) { - LI->replaceAllUsesWith(UndefValue::get(LI->getType())); - if (AST && LI->getType()->isPointerTy()) - AST->deleteValue(LI); - LBI.deleteValue(LI); - LI->eraseFromParent(); - } - return; - } - - // Sort the stores by their index, making it efficient to do a lookup with a - // binary search. 
- std::sort(StoresByIndex.begin(), StoresByIndex.end()); - - // Walk all of the loads from this alloca, replacing them with the nearest - // store above them, if any. - for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E;) { - LoadInst *LI = dyn_cast<LoadInst>(*UI++); - if (!LI) continue; - - unsigned LoadIdx = LBI.getInstructionIndex(LI); - - // Find the nearest store that has a lower than this load. - StoresByIndexTy::iterator I = - std::lower_bound(StoresByIndex.begin(), StoresByIndex.end(), - std::pair<unsigned, StoreInst*>(LoadIdx, static_cast<StoreInst*>(0)), - StoreIndexSearchPredicate()); - - // If there is no store before this load, then we can't promote this load. - if (I == StoresByIndex.begin()) { - // Can't handle this load, bail out. - Info.UsingBlocks.push_back(LI->getParent()); - continue; - } - - // Otherwise, there was a store before this load, the load takes its value. - --I; - LI->replaceAllUsesWith(I->second->getOperand(0)); - if (AST && LI->getType()->isPointerTy()) - AST->deleteValue(LI); - LI->eraseFromParent(); - LBI.deleteValue(LI); - } -} - -// QueuePhiNode - queues a phi-node to be added to a basic-block for a specific -// Alloca returns true if there wasn't already a phi-node for that variable -// +/// Returns true if there wasn't already a phi-node for that variable bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo, unsigned &Version) { // Look up the basic-block in question. PHINode *&PN = NewPhiNodes[std::make_pair(BBNumbers[BB], AllocaNo)]; // If the BB already has a phi node added for the i'th alloca then we're done! - if (PN) return false; + if (PN) + return false; // Create a PhiNode using the dereferenced type... and add the phi-node to the // BasicBlock. PN = PHINode::Create(Allocas[AllocaNo]->getAllocatedType(), getNumPreds(BB), - Allocas[AllocaNo]->getName() + "." + Twine(Version++), + Allocas[AllocaNo]->getName() + "." + Twine(Version++), BB->begin()); ++NumPHIInsert; PhiToAllocaMap[PN] = AllocaNo; @@ -1019,10 +955,11 @@ bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo, return true; } -// RenamePass - Recursively traverse the CFG of the function, renaming loads and -// stores to the allocas which we are promoting. IncomingVals indicates what -// value each Alloca contains on exit from the predecessor block Pred. -// +/// \brief Recursively traverse the CFG of the function, renaming loads and +/// stores to the allocas which we are promoting. +/// +/// IncomingVals indicates what value each Alloca contains on exit from the +/// predecessor block Pred. void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred, RenamePassData::ValVector &IncomingVals, std::vector<RenamePassData> &Worklist) { @@ -1040,48 +977,49 @@ NextIteration: // inserted by this pass of mem2reg will have the same number of incoming // operands so far. Remember this count. unsigned NewPHINumOperands = APN->getNumOperands(); - - unsigned NumEdges = 0; - for (succ_iterator I = succ_begin(Pred), E = succ_end(Pred); I != E; ++I) - if (*I == BB) - ++NumEdges; + + unsigned NumEdges = std::count(succ_begin(Pred), succ_end(Pred), BB); assert(NumEdges && "Must be at least one edge from Pred to BB!"); - + // Add entries for all the phis. BasicBlock::iterator PNI = BB->begin(); do { unsigned AllocaNo = PhiToAllocaMap[APN]; - + // Add N incoming values to the PHI node. for (unsigned i = 0; i != NumEdges; ++i) APN->addIncoming(IncomingVals[AllocaNo], Pred); - + // The currently active variable for this block is now the PHI. 
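
// ------------------------------------------------------------------ [sketch]
// An aside (not from this patch) on the NumEdges computation in RenamePass:
// a conditional branch whose two targets are the same block produces two
// parallel CFG edges, and a PHI in that block needs one incoming entry per
// edge, so the same value is appended NumEdges times. In miniature:
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Succs;   // successor list of Pred, as block ids
  Succs.push_back(7);       // both branch targets point at block 7
  Succs.push_back(7);
  const int BB = 7;
  unsigned NumEdges = std::count(Succs.begin(), Succs.end(), BB);

  std::vector<int> PhiIncoming;
  for (unsigned i = 0; i != NumEdges; ++i)
    PhiIncoming.push_back(42);  // the IncomingVals[AllocaNo] analogue
  std::printf("%zu incoming entries\n", PhiIncoming.size()); // prints 2
  return 0;
}
// ------------------------------------------------------------------ [/sketch]
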
IncomingVals[AllocaNo] = APN; - + // Get the next phi node. ++PNI; APN = dyn_cast<PHINode>(PNI); - if (APN == 0) break; - + if (APN == 0) + break; + // Verify that it is missing entries. If not, it is not being inserted // by this mem2reg invocation so we want to ignore it. } while (APN->getNumOperands() == NewPHINumOperands); } } - + // Don't revisit blocks. - if (!Visited.insert(BB)) return; + if (!Visited.insert(BB)) + return; - for (BasicBlock::iterator II = BB->begin(); !isa<TerminatorInst>(II); ) { + for (BasicBlock::iterator II = BB->begin(); !isa<TerminatorInst>(II);) { Instruction *I = II++; // get the instruction, increment iterator if (LoadInst *LI = dyn_cast<LoadInst>(I)) { AllocaInst *Src = dyn_cast<AllocaInst>(LI->getPointerOperand()); - if (!Src) continue; - - DenseMap<AllocaInst*, unsigned>::iterator AI = AllocaLookup.find(Src); - if (AI == AllocaLookup.end()) continue; + if (!Src) + continue; + + DenseMap<AllocaInst *, unsigned>::iterator AI = AllocaLookup.find(Src); + if (AI == AllocaLookup.end()) + continue; Value *V = IncomingVals[AI->second]; @@ -1094,30 +1032,29 @@ NextIteration: // Delete this instruction and mark the name as the current holder of the // value AllocaInst *Dest = dyn_cast<AllocaInst>(SI->getPointerOperand()); - if (!Dest) continue; - + if (!Dest) + continue; + DenseMap<AllocaInst *, unsigned>::iterator ai = AllocaLookup.find(Dest); if (ai == AllocaLookup.end()) continue; - + // what value were we writing? IncomingVals[ai->second] = SI->getOperand(0); // Record debuginfo for the store before removing it. - if (DbgDeclareInst *DDI = AllocaDbgDeclares[ai->second]) { - if (!DIB) - DIB = new DIBuilder(*SI->getParent()->getParent()->getParent()); - ConvertDebugDeclareToDebugValue(DDI, SI, *DIB); - } + if (DbgDeclareInst *DDI = AllocaDbgDeclares[ai->second]) + ConvertDebugDeclareToDebugValue(DDI, SI, DIB); BB->getInstList().erase(SI); } } // 'Recurse' to our successors. succ_iterator I = succ_begin(BB), E = succ_end(BB); - if (I == E) return; + if (I == E) + return; // Keep track of the successors so we don't visit the same successor twice - SmallPtrSet<BasicBlock*, 8> VisitedSuccs; + SmallPtrSet<BasicBlock *, 8> VisitedSuccs; // Handle the first successor without using the worklist. VisitedSuccs.insert(*I); @@ -1132,18 +1069,11 @@ NextIteration: goto NextIteration; } -/// PromoteMemToReg - Promote the specified list of alloca instructions into -/// scalar registers, inserting PHI nodes as appropriate. This function does -/// not modify the CFG of the function at all. All allocas must be from the -/// same function. -/// -/// If AST is specified, the specified tracker is updated to reflect changes -/// made to the IR. -/// -void llvm::PromoteMemToReg(const std::vector<AllocaInst*> &Allocas, - DominatorTree &DT, AliasSetTracker *AST) { +void llvm::PromoteMemToReg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT, + AliasSetTracker *AST) { // If there is nothing to do, bail out... - if (Allocas.empty()) return; + if (Allocas.empty()) + return; PromoteMem2Reg(Allocas, DT, AST).run(); } diff --git a/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp index 9d90fbe..30adbfa 100644 --- a/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp @@ -42,8 +42,6 @@ SSAUpdater::~SSAUpdater() { delete static_cast<AvailableValsTy*>(AV); } -/// Initialize - Reset this object to get ready for a new set of SSA -/// updates with type 'Ty'. PHI nodes get a name based on 'Name'. 
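
// ------------------------------------------------------------------ [sketch]
// A usage sketch (not from this patch) for the SSAUpdater interface whose
// definitions follow; the surrounding function, blocks, and values are
// assumed to already exist:
//
//   SSAUpdater SSA;
//   SSA.Initialize(V1->getType(), "livevar"); // type + name hint for PHIs
//   SSA.AddAvailableValue(BB1, V1);           // V1 is live out of BB1
//   SSA.AddAvailableValue(BB2, V2);           // V2 is live out of BB2
//   SSA.RewriteUse(U);  // U now sees V1, V2, or a freshly inserted PHI
// ------------------------------------------------------------------ [/sketch]
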
void SSAUpdater::Initialize(Type *Ty, StringRef Name) { if (AV == 0) AV = new AvailableValsTy(); @@ -53,14 +51,10 @@ void SSAUpdater::Initialize(Type *Ty, StringRef Name) { ProtoName = Name; } -/// HasValueForBlock - Return true if the SSAUpdater already has a value for -/// the specified block. bool SSAUpdater::HasValueForBlock(BasicBlock *BB) const { return getAvailableVals(AV).count(BB); } -/// AddAvailableValue - Indicate that a rewritten value is available in the -/// specified block with the specified value. void SSAUpdater::AddAvailableValue(BasicBlock *BB, Value *V) { assert(ProtoType != 0 && "Need to initialize SSAUpdater"); assert(ProtoType == V->getType() && @@ -68,10 +62,8 @@ void SSAUpdater::AddAvailableValue(BasicBlock *BB, Value *V) { getAvailableVals(AV)[BB] = V; } -/// IsEquivalentPHI - Check if PHI has the same incoming value as specified -/// in ValueMapping for each predecessor block. static bool IsEquivalentPHI(PHINode *PHI, - DenseMap<BasicBlock*, Value*> &ValueMapping) { + SmallDenseMap<BasicBlock*, Value*, 8> &ValueMapping) { unsigned PHINumValues = PHI->getNumIncomingValues(); if (PHINumValues != ValueMapping.size()) return false; @@ -86,32 +78,11 @@ static bool IsEquivalentPHI(PHINode *PHI, return true; } -/// GetValueAtEndOfBlock - Construct SSA form, materializing a value that is -/// live at the end of the specified block. Value *SSAUpdater::GetValueAtEndOfBlock(BasicBlock *BB) { Value *Res = GetValueAtEndOfBlockInternal(BB); return Res; } -/// GetValueInMiddleOfBlock - Construct SSA form, materializing a value that -/// is live in the middle of the specified block. -/// -/// GetValueInMiddleOfBlock is the same as GetValueAtEndOfBlock except in one -/// important case: if there is a definition of the rewritten value after the -/// 'use' in BB. Consider code like this: -/// -/// X1 = ... -/// SomeBB: -/// use(X) -/// X2 = ... -/// br Cond, SomeBB, OutBB -/// -/// In this case, there are two values (X1 and X2) added to the AvailableVals -/// set by the client of the rewriter, and those values are both live out of -/// their respective blocks. However, the use of X happens in the *middle* of -/// a block. Because of this, we need to insert a new PHI node in SomeBB to -/// merge the appropriate values, and this value isn't live out of the block. -/// Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { // If there is no definition of the renamed variable in this block, just use // GetValueAtEndOfBlock to do our work. @@ -165,8 +136,8 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { // Otherwise, we do need a PHI: check to see if we already have one available // in this block that produces the right value. if (isa<PHINode>(BB->begin())) { - DenseMap<BasicBlock*, Value*> ValueMapping(PredValues.begin(), - PredValues.end()); + SmallDenseMap<BasicBlock*, Value*, 8> ValueMapping(PredValues.begin(), + PredValues.end()); PHINode *SomePHI; for (BasicBlock::iterator It = BB->begin(); (SomePHI = dyn_cast<PHINode>(It)); ++It) { @@ -203,8 +174,6 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { return InsertedPHI; } -/// RewriteUse - Rewrite a use of the symbolic value. This handles PHI nodes, -/// which use their value in the corresponding predecessor. void SSAUpdater::RewriteUse(Use &U) { Instruction *User = cast<Instruction>(U.getUser()); @@ -222,10 +191,6 @@ void SSAUpdater::RewriteUse(Use &U) { U.set(V); } -/// RewriteUseAfterInsertions - Rewrite a use, just like RewriteUse. 
However, -/// this version of the method can rewrite uses in the same block as a -/// definition, because it assumes that all uses of a value are below any -/// inserted values. void SSAUpdater::RewriteUseAfterInsertions(Use &U) { Instruction *User = cast<Instruction>(U.getUser()); @@ -238,8 +203,6 @@ void SSAUpdater::RewriteUseAfterInsertions(Use &U) { U.set(V); } -/// SSAUpdaterTraits<SSAUpdater> - Traits for the SSAUpdaterImpl template, -/// specialized for SSAUpdater. namespace llvm { template<> class SSAUpdaterTraits<SSAUpdater> { @@ -342,10 +305,9 @@ public: } // End llvm namespace -/// GetValueAtEndOfBlockInternal - Check to see if AvailableVals has an entry -/// for the specified BB and if so, return it. If not, construct SSA form by -/// first calculating the required placement of PHIs and then inserting new -/// PHIs where needed. +/// Check to see if AvailableVals has an entry for the specified BB and if so, +/// return it. If not, construct SSA form by first calculating the required +/// placement of PHIs and then inserting new PHIs where needed. Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) { AvailableValsTy &AvailableVals = getAvailableVals(AV); if (Value *V = AvailableVals[BB]) diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 052ad85..ff50b12 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -40,12 +41,14 @@ #include "llvm/Support/ConstantRange.h" #include "llvm/Support/Debug.h" #include "llvm/Support/NoFolder.h" +#include "llvm/Support/PatternMatch.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include <algorithm> #include <map> #include <set> using namespace llvm; +using namespace PatternMatch; static cl::opt<unsigned> PHINodeFoldingThreshold("phi-node-folding-threshold", cl::Hidden, cl::init(1), @@ -88,7 +91,6 @@ namespace { class SimplifyCFGOpt { const TargetTransformInfo &TTI; const DataLayout *const TD; - Value *isValueEqualityComparison(TerminatorInst *TI); BasicBlock *GetValueEqualityComparisonCases(TerminatorInst *TI, std::vector<ValueEqualityComparisonCase> &Cases); @@ -194,94 +196,7 @@ static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred, PN->addIncoming(PN->getIncomingValueForBlock(ExistPred), NewPred); } - -/// GetIfCondition - Given a basic block (BB) with two predecessors (and at -/// least one PHI node in it), check to see if the merge at this block is due -/// to an "if condition". If so, return the boolean condition that determines -/// which entry into BB will be taken. Also, return by references the block -/// that will be entered from if the condition is true, and the block that will -/// be entered if the condition is false. -/// -/// This does no checking to see if the true/false blocks have large or unsavory -/// instructions in them. 
-static Value *GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue, - BasicBlock *&IfFalse) { - PHINode *SomePHI = cast<PHINode>(BB->begin()); - assert(SomePHI->getNumIncomingValues() == 2 && - "Function can only handle blocks with 2 predecessors!"); - BasicBlock *Pred1 = SomePHI->getIncomingBlock(0); - BasicBlock *Pred2 = SomePHI->getIncomingBlock(1); - - // We can only handle branches. Other control flow will be lowered to - // branches if possible anyway. - BranchInst *Pred1Br = dyn_cast<BranchInst>(Pred1->getTerminator()); - BranchInst *Pred2Br = dyn_cast<BranchInst>(Pred2->getTerminator()); - if (Pred1Br == 0 || Pred2Br == 0) - return 0; - - // Eliminate code duplication by ensuring that Pred1Br is conditional if - // either are. - if (Pred2Br->isConditional()) { - // If both branches are conditional, we don't have an "if statement". In - // reality, we could transform this case, but since the condition will be - // required anyway, we stand no chance of eliminating it, so the xform is - // probably not profitable. - if (Pred1Br->isConditional()) - return 0; - - std::swap(Pred1, Pred2); - std::swap(Pred1Br, Pred2Br); - } - - if (Pred1Br->isConditional()) { - // The only thing we have to watch out for here is to make sure that Pred2 - // doesn't have incoming edges from other blocks. If it does, the condition - // doesn't dominate BB. - if (Pred2->getSinglePredecessor() == 0) - return 0; - - // If we found a conditional branch predecessor, make sure that it branches - // to BB and Pred2Br. If it doesn't, this isn't an "if statement". - if (Pred1Br->getSuccessor(0) == BB && - Pred1Br->getSuccessor(1) == Pred2) { - IfTrue = Pred1; - IfFalse = Pred2; - } else if (Pred1Br->getSuccessor(0) == Pred2 && - Pred1Br->getSuccessor(1) == BB) { - IfTrue = Pred2; - IfFalse = Pred1; - } else { - // We know that one arm of the conditional goes to BB, so the other must - // go somewhere unrelated, and this must not be an "if statement". - return 0; - } - - return Pred1Br->getCondition(); - } - - // Ok, if we got here, both predecessors end with an unconditional branch to - // BB. Don't panic! If both blocks only have a single (identical) - // predecessor, and THAT is a conditional branch, then we're all ok! - BasicBlock *CommonPred = Pred1->getSinglePredecessor(); - if (CommonPred == 0 || CommonPred != Pred2->getSinglePredecessor()) - return 0; - - // Otherwise, if this is a conditional branch, then we can use it! - BranchInst *BI = dyn_cast<BranchInst>(CommonPred->getTerminator()); - if (BI == 0) return 0; - - assert(BI->isConditional() && "Two successors but not conditional?"); - if (BI->getSuccessor(0) == Pred1) { - IfTrue = Pred1; - IfFalse = Pred2; - } else { - IfTrue = Pred2; - IfFalse = Pred1; - } - return BI->getCondition(); -} - -/// ComputeSpeculuationCost - Compute an abstract "cost" of speculating the +/// ComputeSpeculationCost - Compute an abstract "cost" of speculating the /// given instruction, which is assumed to be safe to speculate. 1 means /// cheap, 2 means less cheap, and UINT_MAX means prohibitively expensive. static unsigned ComputeSpeculationCost(const User *I) { @@ -432,7 +347,24 @@ GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra, // If this is an icmp against a constant, handle this as one of the cases. if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) { if (ConstantInt *C = GetConstantInt(I->getOperand(1), TD)) { + Value *RHSVal; + ConstantInt *RHSC; + if (ICI->getPredicate() == (isEQ ? 
ICmpInst::ICMP_EQ:ICmpInst::ICMP_NE)) { + // (x & ~2^x) == y --> x == y || x == y|2^x + // This undoes a transformation done by instcombine to fuse 2 compares. + if (match(ICI->getOperand(0), + m_And(m_Value(RHSVal), m_ConstantInt(RHSC)))) { + APInt Not = ~RHSC->getValue(); + if (Not.isPowerOf2()) { + Vals.push_back(C); + Vals.push_back( + ConstantInt::get(C->getContext(), C->getValue() | Not)); + UsedICmps++; + return RHSVal; + } + } + UsedICmps++; Vals.push_back(C); return I->getOperand(0); @@ -443,6 +375,13 @@ GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra, ConstantRange Span = ConstantRange::makeICmpRegion(ICI->getPredicate(), C->getValue()); + // Shift the range if the compare is fed by an add. This is the range + // compare idiom as emitted by instcombine. + bool hasAdd = + match(I->getOperand(0), m_Add(m_Value(RHSVal), m_ConstantInt(RHSC))); + if (hasAdd) + Span = Span.subtract(RHSC->getValue()); + // If this is an and/!= check then we want to optimize "x ugt 2" into // x != 0 && x != 1. if (!isEQ) @@ -455,7 +394,7 @@ GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra, for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp) Vals.push_back(ConstantInt::get(V->getContext(), Tmp)); UsedICmps++; - return I->getOperand(0); + return hasAdd ? RHSVal : I->getOperand(0); } return 0; } @@ -533,15 +472,17 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) { } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) if (BI->isConditional() && BI->getCondition()->hasOneUse()) if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition())) - if ((ICI->getPredicate() == ICmpInst::ICMP_EQ || - ICI->getPredicate() == ICmpInst::ICMP_NE) && - GetConstantInt(ICI->getOperand(1), TD)) + if (ICI->isEquality() && GetConstantInt(ICI->getOperand(1), TD)) CV = ICI->getOperand(0); // Unwrap any lossless ptrtoint cast. - if (TD && CV && CV->getType() == TD->getIntPtrType(CV->getContext())) - if (PtrToIntInst *PTII = dyn_cast<PtrToIntInst>(CV)) - CV = PTII->getOperand(0); + if (TD && CV) { + if (PtrToIntInst *PTII = dyn_cast<PtrToIntInst>(CV)) { + Value *Ptr = PTII->getPointerOperand(); + if (PTII->getType() == TD->getIntPtrType(Ptr->getType())) + CV = Ptr; + } + } return CV; } @@ -763,9 +704,10 @@ namespace { }; } -static int ConstantIntSortPredicate(const void *P1, const void *P2) { - const ConstantInt *LHS = *(const ConstantInt*const*)P1; - const ConstantInt *RHS = *(const ConstantInt*const*)P2; +static int ConstantIntSortPredicate(ConstantInt *const *P1, + ConstantInt *const *P2) { + const ConstantInt *LHS = *P1; + const ConstantInt *RHS = *P2; if (LHS->getValue().ult(RHS->getValue())) return 1; if (LHS->getValue() == RHS->getValue()) @@ -988,7 +930,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // Convert pointer to int before we switch. if (CV->getType()->isPointerTy()) { assert(TD && "Cannot switch on pointer without DataLayout"); - CV = Builder.CreatePtrToInt(CV, TD->getIntPtrType(CV->getContext()), + CV = Builder.CreatePtrToInt(CV, TD->getIntPtrType(CV->getType()), "magicptr"); } @@ -1083,9 +1025,9 @@ static bool HoistThenElseCodeToIf(BranchInst *BI) { (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2))) return false; - // If we get here, we can hoist at least one instruction. BasicBlock *BIParent = BI->getParent(); + bool Changed = false; do { // If we are hoisting the terminator instruction, don't move one (making a // broken BB), instead clone it, and remove BI. 
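
// ------------------------------------------------------------------ [sketch]
// A quick check (not from this patch) of the identity behind the
// GatherConstantCompares hunk above: when the mask ~B clears exactly one bit
// B, (x & ~B) == C is equivalent to x == C || x == (C | B), provided C has
// bit B clear (which instcombine's fused form guarantees). B and C below are
// arbitrary sample values.
#include <cassert>

int main() {
  const unsigned B = 2; // the single bit cleared by the mask (a power of two)
  const unsigned C = 8; // comparison constant with bit B clear
  for (unsigned x = 0; x < 256; ++x) {
    const bool fused = (x & ~B) == C;
    const bool split = (x == C) || (x == (C | B));
    assert(fused == split);
  }
  return 0;
}
// ------------------------------------------------------------------ [/sketch]
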
@@ -1100,6 +1042,7 @@ static bool HoistThenElseCodeToIf(BranchInst *BI) { I2->replaceAllUsesWith(I1); I1->intersectOptionalDataWith(I2); I2->eraseFromParent(); + Changed = true; I1 = BB1_Itr++; I2 = BB2_Itr++; @@ -1119,7 +1062,23 @@ static bool HoistThenElseCodeToIf(BranchInst *BI) { HoistTerminator: // It may not be possible to hoist an invoke. if (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2)) - return true; + return Changed; + + for (succ_iterator SI = succ_begin(BB1), E = succ_end(BB1); SI != E; ++SI) { + PHINode *PN; + for (BasicBlock::iterator BBI = SI->begin(); + (PN = dyn_cast<PHINode>(BBI)); ++BBI) { + Value *BB1V = PN->getIncomingValueForBlock(BB1); + Value *BB2V = PN->getIncomingValueForBlock(BB2); + if (BB1V == BB2V) + continue; + + if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V)) + return Changed; + if (isa<ConstantExpr>(BB2V) && !isSafeToSpeculativelyExecute(BB2V)) + return Changed; + } + } // Okay, it is safe to hoist the terminator. Instruction *NT = I1->clone(); @@ -1362,8 +1321,8 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { /// /// \return The pointer to the value of the previous store if the store can be /// hoisted into the predecessor block. 0 otherwise. -Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB, - BasicBlock *StoreBB, BasicBlock *EndBB) { +static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB, + BasicBlock *StoreBB, BasicBlock *EndBB) { StoreInst *StoreToHoist = dyn_cast<StoreInst>(I); if (!StoreToHoist) return 0; @@ -1522,18 +1481,23 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) { Value *OrigV = PN->getIncomingValueForBlock(BB); Value *ThenV = PN->getIncomingValueForBlock(ThenBB); + // FIXME: Try to remove some of the duplication with HoistThenElseCodeToIf. // Skip PHIs which are trivial. if (ThenV == OrigV) continue; HaveRewritablePHIs = true; - ConstantExpr *CE = dyn_cast<ConstantExpr>(ThenV); - if (!CE) + ConstantExpr *OrigCE = dyn_cast<ConstantExpr>(OrigV); + ConstantExpr *ThenCE = dyn_cast<ConstantExpr>(ThenV); + if (!OrigCE && !ThenCE) continue; // Known safe and cheap. - if (!isSafeToSpeculativelyExecute(CE)) + if ((ThenCE && !isSafeToSpeculativelyExecute(ThenCE)) || + (OrigCE && !isSafeToSpeculativelyExecute(OrigCE))) return false; - if (ComputeSpeculationCost(CE) > PHINodeFoldingThreshold) + unsigned OrigCost = OrigCE ? ComputeSpeculationCost(OrigCE) : 0; + unsigned ThenCost = ThenCE ? ComputeSpeculationCost(ThenCE) : 0; + if (OrigCost + ThenCost > 2 * PHINodeFoldingThreshold) return false; // Account for the cost of an unfolded ConstantExpr which could end up @@ -1598,6 +1562,19 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) { return true; } +/// \returns True if this block contains a CallInst with the NoDuplicate +/// attribute. +static bool HasNoDuplicateCall(const BasicBlock *BB) { + for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + const CallInst *CI = dyn_cast<CallInst>(I); + if (!CI) + continue; + if (CI->cannotDuplicate()) + return true; + } + return false; +} + /// BlockIsSimpleEnoughToThreadThrough - Return true if we can thread a branch /// across this block. static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) { @@ -1645,6 +1622,8 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout *TD) { // Now we know that this block has multiple preds and two succs. 
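  // As an illustrative sketch (not this pass's code): when the branch condition
  // is a phi that is a known constant along some predecessor P, P can jump
  // straight to the matching successor instead of going through BB, at the
  // cost of cloning BB's remaining instructions into P -- which is exactly why
  // blocks containing 'noduplicate' calls are rejected just below.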
if (!BlockIsSimpleEnoughToThreadThrough(BB)) return false; + if (HasNoDuplicateCall(BB)) return false; + // Okay, this is a simple enough basic block. See if any phi values are // constants. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { @@ -2111,14 +2090,19 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // Ensure that any values used in the bonus instruction are also used // by the terminator of the predecessor. This means that those values // must already have been resolved, so we won't be inhibiting the - // out-of-order core by speculating them earlier. - if (BonusInst) { + // out-of-order core by speculating them earlier. We also allow + // instructions that are used by the terminator's condition because it + // exposes more merging opportunities. + bool UsedByBranch = (BonusInst && BonusInst->hasOneUse() && + *BonusInst->use_begin() == Cond); + + if (BonusInst && !UsedByBranch) { // Collect the values used by the bonus inst SmallPtrSet<Value*, 4> UsedValues; for (Instruction::op_iterator OI = BonusInst->op_begin(), OE = BonusInst->op_end(); OI != OE; ++OI) { Value *V = *OI; - if (!isa<Constant>(V)) + if (!isa<Constant>(V) && !isa<Argument>(V)) UsedValues.insert(V); } @@ -2829,7 +2813,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const DataLayout *TD, if (CompVal->getType()->isPointerTy()) { assert(TD && "Cannot switch on pointer without DataLayout"); CompVal = Builder.CreatePtrToInt(CompVal, - TD->getIntPtrType(CompVal->getContext()), + TD->getIntPtrType(CompVal->getType()), "magicptr"); } @@ -3202,7 +3186,7 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) { /// and use it to remove dead cases. static bool EliminateDeadSwitchCases(SwitchInst *SI) { Value *Cond = SI->getCondition(); - unsigned Bits = cast<IntegerType>(Cond->getType())->getBitWidth(); + unsigned Bits = Cond->getType()->getIntegerBitWidth(); APInt KnownZero(Bits, 0), KnownOne(Bits, 0); ComputeMaskedBits(Cond, KnownZero, KnownOne); @@ -3307,7 +3291,7 @@ static bool ForwardSwitchConditionToPHI(SwitchInst *SI) { for (ForwardingNodesMap::iterator I = ForwardingNodes.begin(), E = ForwardingNodes.end(); I != E; ++I) { PHINode *Phi = I->first; - SmallVector<int,4> &Indexes = I->second; + SmallVectorImpl<int> &Indexes = I->second; if (Indexes.size() < 2) continue; @@ -3345,28 +3329,10 @@ static Constant *LookupConstant(Value *V, /// simple instructions such as binary operations where both operands are /// constant or can be replaced by constants from the ConstantPool. Returns the /// resulting constant on success, 0 otherwise. 
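/// For example (hypothetical values): with ConstantPool mapping %x to i32 7,
/// an 'icmp eq %x, 7' in the case block folds to true, and a select fed by
/// that compare folds to its true operand, so the caller below can record a
/// known constant for the corresponding phi.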
-static Constant *ConstantFold(Instruction *I, - const SmallDenseMap<Value*, Constant*>& ConstantPool) { - if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { - Constant *A = LookupConstant(BO->getOperand(0), ConstantPool); - if (!A) - return 0; - Constant *B = LookupConstant(BO->getOperand(1), ConstantPool); - if (!B) - return 0; - return ConstantExpr::get(BO->getOpcode(), A, B); - } - - if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) { - Constant *A = LookupConstant(I->getOperand(0), ConstantPool); - if (!A) - return 0; - Constant *B = LookupConstant(I->getOperand(1), ConstantPool); - if (!B) - return 0; - return ConstantExpr::getCompare(Cmp->getPredicate(), A, B); - } - +static Constant * +ConstantFold(Instruction *I, + const SmallDenseMap<Value *, Constant *> &ConstantPool, + const DataLayout *DL) { if (SelectInst *Select = dyn_cast<SelectInst>(I)) { Constant *A = LookupConstant(Select->getCondition(), ConstantPool); if (!A) @@ -3378,25 +3344,32 @@ static Constant *ConstantFold(Instruction *I, return 0; } - if (CastInst *Cast = dyn_cast<CastInst>(I)) { - Constant *A = LookupConstant(I->getOperand(0), ConstantPool); - if (!A) + SmallVector<Constant *, 4> COps; + for (unsigned N = 0, E = I->getNumOperands(); N != E; ++N) { + if (Constant *A = LookupConstant(I->getOperand(N), ConstantPool)) + COps.push_back(A); + else return 0; - return ConstantExpr::getCast(Cast->getOpcode(), A, Cast->getDestTy()); } - return 0; + if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) + return ConstantFoldCompareInstOperands(Cmp->getPredicate(), COps[0], + COps[1], DL); + + return ConstantFoldInstOperands(I->getOpcode(), I->getType(), COps, DL); } /// GetCaseResults - Try to determine the resulting constant values in phi nodes /// at the common destination basic block, *CommonDest, for one of the case /// destionations CaseDest corresponding to value CaseVal (0 for the default /// case), of a switch instruction SI. -static bool GetCaseResults(SwitchInst *SI, - ConstantInt *CaseVal, - BasicBlock *CaseDest, - BasicBlock **CommonDest, - SmallVector<std::pair<PHINode*,Constant*>, 4> &Res) { +static bool +GetCaseResults(SwitchInst *SI, + ConstantInt *CaseVal, + BasicBlock *CaseDest, + BasicBlock **CommonDest, + SmallVectorImpl<std::pair<PHINode *, Constant *> > &Res, + const DataLayout *DL) { // The block from which we enter the common destination. BasicBlock *Pred = SI->getParent(); @@ -3415,7 +3388,7 @@ static bool GetCaseResults(SwitchInst *SI, } else if (isa<DbgInfoIntrinsic>(I)) { // Skip debug intrinsic. continue; - } else if (Constant *C = ConstantFold(I, ConstantPool)) { + } else if (Constant *C = ConstantFold(I, ConstantPool, DL)) { // Instruction is side-effect free and constant. 
ConstantPool.insert(std::make_pair(I, C)); } else { @@ -3469,7 +3442,7 @@ namespace { SwitchLookupTable(Module &M, uint64_t TableSize, ConstantInt *Offset, - const SmallVector<std::pair<ConstantInt*, Constant*>, 4>& Values, + const SmallVectorImpl<std::pair<ConstantInt*, Constant*> >& Values, Constant *DefaultValue, const DataLayout *TD); @@ -3516,7 +3489,7 @@ namespace { SwitchLookupTable::SwitchLookupTable(Module &M, uint64_t TableSize, ConstantInt *Offset, - const SmallVector<std::pair<ConstantInt*, Constant*>, 4>& Values, + const SmallVectorImpl<std::pair<ConstantInt*, Constant*> >& Values, Constant *DefaultValue, const DataLayout *TD) : SingleValue(0), BitMap(0), BitMapElementTy(0), Array(0) { @@ -3643,7 +3616,7 @@ bool SwitchLookupTable::WouldFitInRegister(const DataLayout *TD, } /// ShouldBuildLookupTable - Determine whether a lookup table should be built -/// for this switch, based on the number of caes, size of the table and the +/// for this switch, based on the number of cases, size of the table and the /// types of the results. static bool ShouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize, @@ -3739,7 +3712,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, typedef SmallVector<std::pair<PHINode*, Constant*>, 4> ResultsTy; ResultsTy Results; if (!GetCaseResults(SI, CaseVal, CI.getCaseSuccessor(), &CommonDest, - Results)) + Results, TD)) return false; // Append the result from this case to the list for each phi. @@ -3753,7 +3726,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, // Get the resulting values for the default case. SmallVector<std::pair<PHINode*, Constant*>, 4> DefaultResultsList; if (!GetCaseResults(SI, 0, SI->getDefaultDest(), &CommonDest, - DefaultResultsList)) + DefaultResultsList, TD)) return false; for (size_t I = 0, E = DefaultResultsList.size(); I != E; ++I) { PHINode *PHI = DefaultResultsList[I].first; @@ -3774,14 +3747,32 @@ static bool SwitchToLookupTable(SwitchInst *SI, CommonDest->getParent(), CommonDest); - // Check whether the condition value is within the case range, and branch to - // the new BB. + // Compute the table index value. Builder.SetInsertPoint(SI); Value *TableIndex = Builder.CreateSub(SI->getCondition(), MinCaseVal, "switch.tableidx"); - Value *Cmp = Builder.CreateICmpULT(TableIndex, ConstantInt::get( - MinCaseVal->getType(), TableSize)); - Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest()); + + // Compute the maximum table size representable by the integer type we are + // switching upon. + unsigned CaseSize = MinCaseVal->getType()->getPrimitiveSizeInBits(); + uint64_t MaxTableSize = CaseSize > 63? UINT64_MAX : 1ULL << CaseSize; + assert(MaxTableSize >= TableSize && + "It is impossible for a switch to have more entries than the max " + "representable value of its input integer type's size."); + + // If we have a fully covered lookup table, unconditionally branch to the + // lookup table BB. Otherwise, check if the condition value is within the case + // range. If it is so, branch to the new BB. Otherwise branch to SI's default + // destination. + const bool GeneratingCoveredLookupTable = MaxTableSize == TableSize; + if (GeneratingCoveredLookupTable) { + Builder.CreateBr(LookupBB); + SI->getDefaultDest()->removePredecessor(SI->getParent()); + } else { + Value *Cmp = Builder.CreateICmpULT(TableIndex, ConstantInt::get( + MinCaseVal->getType(), TableSize)); + Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest()); + } // Populate the BB that does the lookups. 
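  // (A source-level sketch of the two shapes built above, with made-up names:
  //    covered:  r = table[x - min];                          // no bounds check
  //    guarded:  r = (x - min < size) ? table[x - min] : d;   // default kept
  //  the covered form arises e.g. when a switch on an i2 value spans all four
  //  possible case values, so MaxTableSize == TableSize.)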
Builder.SetInsertPoint(LookupBB); @@ -3810,9 +3801,11 @@ static bool SwitchToLookupTable(SwitchInst *SI, Builder.CreateBr(CommonDest); // Remove the switch. - for (unsigned i = 0; i < SI->getNumSuccessors(); ++i) { + for (unsigned i = 0, e = SI->getNumSuccessors(); i < e; ++i) { BasicBlock *Succ = SI->getSuccessor(i); - if (Succ == SI->getDefaultDest()) continue; + + if (Succ == SI->getDefaultDest()) + continue; Succ->removePredecessor(SI->getParent()); } SI->eraseFromParent(); diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp index 41c207c..bf3442a 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -119,7 +119,7 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand) return 0; D = ConstantInt::get(UseInst->getContext(), - APInt(BitWidth, 1).shl(D->getZExtValue())); + APInt::getOneBitSet(BitWidth, D->getZExtValue())); } FoldedExpr = SE->getUDivExpr(SE->getSCEV(IVSrc), SE->getSCEV(D)); } diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 6bea2dd..15b3e66 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -17,6 +17,7 @@ #include "llvm/Transforms/Utils/SimplifyLibCalls.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringMap.h" +#include "llvm/ADT/Triple.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" @@ -26,11 +27,16 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Support/Allocator.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" using namespace llvm; +static cl::opt<bool> +ColdErrorCalls("error-reporting-is-cold", cl::init(true), + cl::Hidden, cl::desc("Treat error-reporting calls as cold")); + /// This class is the abstract base class for the set of optimizations that /// corresponds to one library call. namespace { @@ -118,6 +124,21 @@ static bool callHasFloatingPointArgument(const CallInst *CI) { return false; } +/// \brief Check whether the overloaded unary floating point function +/// corresponing to \a Ty is available. +static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty, + LibFunc::Func DoubleFn, LibFunc::Func FloatFn, + LibFunc::Func LongDoubleFn) { + switch (Ty->getTypeID()) { + case Type::FloatTyID: + return TLI->has(FloatFn); + case Type::DoubleTyID: + return TLI->has(DoubleFn); + default: + return TLI->has(LongDoubleFn); + } +} + //===----------------------------------------------------------------------===// // Fortified Library Call Optimizations //===----------------------------------------------------------------------===// @@ -477,7 +498,7 @@ struct StrChrOpt : public LibCallOptimization { // Compute the offset, make sure to handle the case when we're searching for // zero (a weird way to spell strlen). - size_t I = CharC->getSExtValue() == 0 ? + size_t I = (0xFF & CharC->getSExtValue()) == 0 ? Str.size() : Str.find(CharC->getSExtValue()); if (I == StringRef::npos) // Didn't find the char. strchr returns null. return Constant::getNullValue(CI->getType()); @@ -513,7 +534,7 @@ struct StrRChrOpt : public LibCallOptimization { } // Compute the offset. - size_t I = CharC->getSExtValue() == 0 ? + size_t I = (0xFF & CharC->getSExtValue()) == 0 ? 
Str.size() : Str.rfind(CharC->getSExtValue()); if (I == StringRef::npos) // Didn't find the char. Return null. return Constant::getNullValue(CI->getType()); @@ -774,7 +795,7 @@ struct StrPBrkOpt : public LibCallOptimization { // Constant folding. if (HasS1 && HasS2) { size_t I = S1.find_first_of(S2); - if (I == std::string::npos) // No match. + if (I == StringRef::npos) // No match. return Constant::getNullValue(CI->getType()); return B.CreateGEP(CI->getArgOperand(0), B.getInt64(I), "strpbrk"); @@ -912,7 +933,7 @@ struct StrStrOpt : public LibCallOptimization { // If both strings are known, constant fold it. if (HasStr1 && HasStr2) { - std::string::size_type Offset = SearchStr.find(ToFindStr); + size_t Offset = SearchStr.find(ToFindStr); if (Offset == StringRef::npos) // strstr("foo", "bar") -> null return Constant::getNullValue(CI->getType()); @@ -1031,7 +1052,7 @@ struct MemSetOpt : public LibCallOptimization { if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isIntegerTy() || - FT->getParamType(2) != TD->getIntPtrType(*Context)) + FT->getParamType(2) != TD->getIntPtrType(FT->getParamType(0))) return 0; // memset(p, v, n) -> llvm.memset(p, v, n, 1) @@ -1133,9 +1154,13 @@ struct PowOpt : public UnsafeFPLibCallOptimization { Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1); if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) { - if (Op1C->isExactlyValue(1.0)) // pow(1.0, x) -> 1.0 + // pow(1.0, x) -> 1.0 + if (Op1C->isExactlyValue(1.0)) return Op1C; - if (Op1C->isExactlyValue(2.0)) // pow(2.0, x) -> exp2(x) + // pow(2.0, x) -> exp2(x) + if (Op1C->isExactlyValue(2.0) && + hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp2, LibFunc::exp2f, + LibFunc::exp2l)) return EmitUnaryFloatFnCall(Op2, "exp2", B, Callee->getAttributes()); } @@ -1145,7 +1170,11 @@ struct PowOpt : public UnsafeFPLibCallOptimization { if (Op2C->getValueAPF().isZero()) // pow(x, 0.0) -> 1.0 return ConstantFP::get(CI->getType(), 1.0); - if (Op2C->isExactlyValue(0.5)) { + if (Op2C->isExactlyValue(0.5) && + hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::sqrt, LibFunc::sqrtf, + LibFunc::sqrtl) && + hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::fabs, LibFunc::fabsf, + LibFunc::fabsl)) { // Expand pow(x, 0.5) to (x == -infinity ? +infinity : fabs(sqrt(x))). // This is faster than calling pow, and still handles negative zero // and negative infinity correctly. @@ -1178,7 +1207,7 @@ struct Exp2Opt : public UnsafeFPLibCallOptimization { virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { Value *Ret = NULL; if (UnsafeFPShrink && Callee->getName() == "exp2" && - TLI->has(LibFunc::exp2)) { + TLI->has(LibFunc::exp2f)) { UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true); Ret = UnsafeUnaryDoubleFP.callOptimizer(Callee, CI, B); } @@ -1229,6 +1258,155 @@ struct Exp2Opt : public UnsafeFPLibCallOptimization { } }; +struct SinCosPiOpt : public LibCallOptimization { + SinCosPiOpt() {} + + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Make sure the prototype is as expected, otherwise the rest of the + // function is probably invalid and likely to abort. + if (!isTrigLibCall(CI)) + return 0; + + Value *Arg = CI->getArgOperand(0); + SmallVector<CallInst *, 1> SinCalls; + SmallVector<CallInst *, 1> CosCalls; + SmallVector<CallInst *, 1> SinCosCalls; + + bool IsFloat = Arg->getType()->isFloatTy(); + + // Look for all compatible sinpi, cospi and sincospi calls with the same + // argument. 
If there are enough (in some sense) we can make the + // substitution. + for (Value::use_iterator UI = Arg->use_begin(), UE = Arg->use_end(); + UI != UE; ++UI) + classifyArgUse(*UI, CI->getParent(), IsFloat, SinCalls, CosCalls, + SinCosCalls); + + // It's only worthwhile if both sinpi and cospi are actually used. + if (SinCosCalls.empty() && (SinCalls.empty() || CosCalls.empty())) + return 0; + + Value *Sin, *Cos, *SinCos; + insertSinCosCall(B, CI->getCalledFunction(), Arg, IsFloat, Sin, Cos, + SinCos); + + replaceTrigInsts(SinCalls, Sin); + replaceTrigInsts(CosCalls, Cos); + replaceTrigInsts(SinCosCalls, SinCos); + + return 0; + } + + bool isTrigLibCall(CallInst *CI) { + Function *Callee = CI->getCalledFunction(); + FunctionType *FT = Callee->getFunctionType(); + + // We can only hope to do anything useful if we can ignore things like errno + // and floating-point exceptions. + bool AttributesSafe = CI->hasFnAttr(Attribute::NoUnwind) && + CI->hasFnAttr(Attribute::ReadNone); + + // Other than that we need float(float) or double(double) + return AttributesSafe && FT->getNumParams() == 1 && + FT->getReturnType() == FT->getParamType(0) && + (FT->getParamType(0)->isFloatTy() || + FT->getParamType(0)->isDoubleTy()); + } + + void classifyArgUse(Value *Val, BasicBlock *BB, bool IsFloat, + SmallVectorImpl<CallInst *> &SinCalls, + SmallVectorImpl<CallInst *> &CosCalls, + SmallVectorImpl<CallInst *> &SinCosCalls) { + CallInst *CI = dyn_cast<CallInst>(Val); + + if (!CI) + return; + + Function *Callee = CI->getCalledFunction(); + StringRef FuncName = Callee->getName(); + LibFunc::Func Func; + if (!TLI->getLibFunc(FuncName, Func) || !TLI->has(Func) || + !isTrigLibCall(CI)) + return; + + if (IsFloat) { + if (Func == LibFunc::sinpif) + SinCalls.push_back(CI); + else if (Func == LibFunc::cospif) + CosCalls.push_back(CI); + else if (Func == LibFunc::sincospi_stretf) + SinCosCalls.push_back(CI); + } else { + if (Func == LibFunc::sinpi) + SinCalls.push_back(CI); + else if (Func == LibFunc::cospi) + CosCalls.push_back(CI); + else if (Func == LibFunc::sincospi_stret) + SinCosCalls.push_back(CI); + } + } + + void replaceTrigInsts(SmallVectorImpl<CallInst*> &Calls, Value *Res) { + for (SmallVectorImpl<CallInst*>::iterator I = Calls.begin(), + E = Calls.end(); + I != E; ++I) { + LCS->replaceAllUsesWith(*I, Res); + } + } + + void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg, + bool UseFloat, Value *&Sin, Value *&Cos, + Value *&SinCos) { + Type *ArgTy = Arg->getType(); + Type *ResTy; + StringRef Name; + + Triple T(OrigCallee->getParent()->getTargetTriple()); + if (UseFloat) { + Name = "__sincospi_stretf"; + + assert(T.getArch() != Triple::x86 && "x86 messy and unsupported for now"); + // x86_64 can't use {float, float} since that would be returned in both + // xmm0 and xmm1, which isn't what a real struct would do. + ResTy = T.getArch() == Triple::x86_64 + ? static_cast<Type *>(VectorType::get(ArgTy, 2)) + : static_cast<Type *>(StructType::get(ArgTy, ArgTy, NULL)); + } else { + Name = "__sincospi_stret"; + ResTy = StructType::get(ArgTy, ArgTy, NULL); + } + + Module *M = OrigCallee->getParent(); + Value *Callee = M->getOrInsertFunction(Name, OrigCallee->getAttributes(), + ResTy, ArgTy, NULL); + + if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) { + // If the argument is an instruction, it must dominate all uses so put our + // sincos call there. + BasicBlock::iterator Loc = ArgInst; + B.SetInsertPoint(ArgInst->getParent(), ++Loc); + } else { + // Otherwise (e.g. 
for a constant) the beginning of the function is as + // good a place as any. + BasicBlock &EntryBB = B.GetInsertBlock()->getParent()->getEntryBlock(); + B.SetInsertPoint(&EntryBB, EntryBB.begin()); + } + + SinCos = B.CreateCall(Callee, Arg, "sincospi"); + + if (SinCos->getType()->isStructTy()) { + Sin = B.CreateExtractValue(SinCos, 0, "sinpi"); + Cos = B.CreateExtractValue(SinCos, 1, "cospi"); + } else { + Sin = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 0), + "sinpi"); + Cos = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 1), + "cospi"); + } + } + +}; + //===----------------------------------------------------------------------===// // Integer Library Call Optimizations //===----------------------------------------------------------------------===// @@ -1333,6 +1511,54 @@ struct ToAsciiOpt : public LibCallOptimization { // Formatting and IO Library Call Optimizations //===----------------------------------------------------------------------===// +struct ErrorReportingOpt : public LibCallOptimization { + ErrorReportingOpt(int S = -1) : StreamArg(S) {} + + virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &) { + // Error reporting calls should be cold, mark them as such. + // This applies even to non-builtin calls: it is only a hint and applies to + // functions that the frontend might not understand as builtins. + + // This heuristic was suggested in: + // Improving Static Branch Prediction in a Compiler + // Brian L. Deitrich, Ben-Chung Cheng, Wen-mei W. Hwu + // Proceedings of PACT'98, Oct. 1998, IEEE + + if (!CI->hasFnAttr(Attribute::Cold) && isReportingError(Callee, CI)) { + CI->addAttribute(AttributeSet::FunctionIndex, Attribute::Cold); + } + + return 0; + } + +protected: + bool isReportingError(Function *Callee, CallInst *CI) { + if (!ColdErrorCalls) + return false; + + if (!Callee || !Callee->isDeclaration()) + return false; + + if (StreamArg < 0) + return true; + + // These functions might be considered cold, but only if their stream + // argument is stderr. + + if (StreamArg >= (int) CI->getNumArgOperands()) + return false; + LoadInst *LI = dyn_cast<LoadInst>(CI->getArgOperand(StreamArg)); + if (!LI) + return false; + GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand()); + if (!GV || !GV->isDeclaration()) + return false; + return GV->getName() == "stderr"; + } + + int StreamArg; +}; + struct PrintFOpt : public LibCallOptimization { Value *optimizeFixedFormatString(Function *Callee, CallInst *CI, IRBuilder<> &B) { @@ -1361,7 +1587,7 @@ struct PrintFOpt : public LibCallOptimization { // printf("foo\n") --> puts("foo") if (FormatStr[FormatStr.size()-1] == '\n' && - FormatStr.find('%') == std::string::npos) { // no format characters. + FormatStr.find('%') == StringRef::npos) { // No format characters. // Create a string literal with no \n on it. We expect the constant merge // pass to be run after this pass, to merge duplicate strings. FormatStr = FormatStr.drop_back(); @@ -1513,6 +1739,9 @@ struct SPrintFOpt : public LibCallOptimization { struct FPrintFOpt : public LibCallOptimization { Value *optimizeFixedFormatString(Function *Callee, CallInst *CI, IRBuilder<> &B) { + ErrorReportingOpt ER(/* StreamArg = */ 0); + (void) ER.callOptimizer(Callee, CI, B); + // All the optimizations depend on the format string. 
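    // (Independent of the format string, the ErrorReportingOpt run above has
    // already considered marking this call 'cold': e.g. a user's
    //   fprintf(stderr, "open failed\n");
    // qualifies because its stream argument loads from the 'stderr' global.
    // The attribute is only a block-placement hint, comparable to a
    // hand-written __attribute__((cold)) declaration; illustrative example,
    // not pass code.)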
StringRef FormatStr; if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr)) @@ -1590,6 +1819,9 @@ struct FPrintFOpt : public LibCallOptimization { struct FWriteOpt : public LibCallOptimization { virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + ErrorReportingOpt ER(/* StreamArg = */ 3); + (void) ER.callOptimizer(Callee, CI, B); + // Require a pointer, an integer, an integer, a pointer, returning integer. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 4 || !FT->getParamType(0)->isPointerTy() || @@ -1623,6 +1855,9 @@ struct FWriteOpt : public LibCallOptimization { struct FPutsOpt : public LibCallOptimization { virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + ErrorReportingOpt ER(/* StreamArg = */ 1); + (void) ER.callOptimizer(Callee, CI, B); + // These optimizations require DataLayout. if (!TD) return 0; @@ -1741,6 +1976,7 @@ static MemSetOpt MemSet; // Math library call optimizations. static UnaryDoubleFPOpt UnaryDoubleFP(false); static UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true); +static SinCosPiOpt SinCosPi; // Integer library call optimizations. static FFSOpt FFS; @@ -1750,6 +1986,9 @@ static IsAsciiOpt IsAscii; static ToAsciiOpt ToAscii; // Formatting and IO library call optimizations. +static ErrorReportingOpt ErrorReporting; +static ErrorReportingOpt ErrorReporting0(0); +static ErrorReportingOpt ErrorReporting1(1); static PrintFOpt PrintF; static SPrintFOpt SPrintF; static FPrintFOpt FPrintF; @@ -1825,6 +2064,11 @@ LibCallOptimization *LibCallSimplifierImpl::lookupOptimization(CallInst *CI) { case LibFunc::cos: case LibFunc::cosl: return &Cos; + case LibFunc::sinpif: + case LibFunc::sinpi: + case LibFunc::cospif: + case LibFunc::cospi: + return &SinCosPi; case LibFunc::powf: case LibFunc::pow: case LibFunc::powl: @@ -1859,6 +2103,13 @@ LibCallOptimization *LibCallSimplifierImpl::lookupOptimization(CallInst *CI) { return &FPuts; case LibFunc::puts: return &Puts; + case LibFunc::perror: + return &ErrorReporting; + case LibFunc::vfprintf: + case LibFunc::fiprintf: + return &ErrorReporting0; + case LibFunc::fputc: + return &ErrorReporting1; case LibFunc::ceil: case LibFunc::fabs: case LibFunc::floor: @@ -1940,7 +2191,7 @@ LibCallSimplifier::~LibCallSimplifier() { } Value *LibCallSimplifier::optimizeCall(CallInst *CI) { - if (CI->hasFnAttr(Attribute::NoBuiltin)) return 0; + if (CI->isNoBuiltin()) return 0; return Impl->optimizeCall(CI); } @@ -1950,3 +2201,53 @@ void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) const { } } + +// TODO: +// Additional cases that we need to add to this file: +// +// cbrt: +// * cbrt(expN(X)) -> expN(x/3) +// * cbrt(sqrt(x)) -> pow(x,1/6) +// * cbrt(sqrt(x)) -> pow(x,1/9) +// +// exp, expf, expl: +// * exp(log(x)) -> x +// +// log, logf, logl: +// * log(exp(x)) -> x +// * log(x**y) -> y*log(x) +// * log(exp(y)) -> y*log(e) +// * log(exp2(y)) -> y*log(2) +// * log(exp10(y)) -> y*log(10) +// * log(sqrt(x)) -> 0.5*log(x) +// * log(pow(x,y)) -> y*log(x) +// +// lround, lroundf, lroundl: +// * lround(cnst) -> cnst' +// +// pow, powf, powl: +// * pow(exp(x),y) -> exp(x*y) +// * pow(sqrt(x),y) -> pow(x,y*0.5) +// * pow(pow(x,y),z)-> pow(x,y*z) +// +// round, roundf, roundl: +// * round(cnst) -> cnst' +// +// signbit: +// * signbit(cnst) -> cnst' +// * signbit(nncst) -> 0 (if pstv is a non-negative constant) +// +// sqrt, sqrtf, sqrtl: +// * sqrt(expN(x)) -> expN(x*0.5) +// * sqrt(Nroot(x)) -> pow(x,1/(2*N)) +// * sqrt(pow(x,y)) -> pow(|x|,y*0.5) +// +// 
strchr: +// * strchr(p, 0) -> strlen(p) +// tan, tanf, tanl: +// * tan(atan(x)) -> x +// +// trunc, truncf, truncl: +// * trunc(cnst) -> cnst' +// +// diff --git a/contrib/llvm/lib/Transforms/Utils/SpecialCaseList.cpp b/contrib/llvm/lib/Transforms/Utils/SpecialCaseList.cpp new file mode 100644 index 0000000..2ef692c --- /dev/null +++ b/contrib/llvm/lib/Transforms/Utils/SpecialCaseList.cpp @@ -0,0 +1,222 @@ +//===-- SpecialCaseList.cpp - special case list for sanitizers ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is a utility class for instrumentation passes (like AddressSanitizer +// or ThreadSanitizer) to avoid instrumenting some functions or global +// variables, or to instrument some functions or global variables in a specific +// way, based on a user-supplied list. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/SpecialCaseList.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Regex.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/system_error.h" +#include <string> +#include <utility> + +namespace llvm { + +/// Represents a set of regular expressions. Regular expressions which are +/// "literal" (i.e. no regex metacharacters) are stored in Strings, while all +/// others are represented as a single pipe-separated regex in RegEx. The +/// reason for doing so is efficiency; StringSet is much faster at matching +/// literal strings than Regex. +struct SpecialCaseList::Entry { + StringSet<> Strings; + Regex *RegEx; + + Entry() : RegEx(0) {} + + bool match(StringRef Query) const { + return Strings.count(Query) || (RegEx && RegEx->match(Query)); + } +}; + +SpecialCaseList::SpecialCaseList() : Entries() {} + +SpecialCaseList *SpecialCaseList::create( + const StringRef Path, std::string &Error) { + if (Path.empty()) + return new SpecialCaseList(); + OwningPtr<MemoryBuffer> File; + if (error_code EC = MemoryBuffer::getFile(Path, File)) { + Error = (Twine("Can't open file '") + Path + "': " + EC.message()).str(); + return 0; + } + return create(File.get(), Error); +} + +SpecialCaseList *SpecialCaseList::create( + const MemoryBuffer *MB, std::string &Error) { + OwningPtr<SpecialCaseList> SCL(new SpecialCaseList()); + if (!SCL->parse(MB, Error)) + return 0; + return SCL.take(); +} + +SpecialCaseList *SpecialCaseList::createOrDie(const StringRef Path) { + std::string Error; + if (SpecialCaseList *SCL = create(Path, Error)) + return SCL; + report_fatal_error(Error); +} + +bool SpecialCaseList::parse(const MemoryBuffer *MB, std::string &Error) { + // Iterate through each line in the blacklist file. 
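  // Each line is "prefix:pattern[=category]", as parsed below. A hypothetical
  // list, to make the branches concrete:
  //   fun:MyNoInstrumentFunction     <- literal, kept in the StringSet fast path
  //   global:bad_global_*            <- '*' is rewritten to '.*', regex-compiled
  //   src:third_party/.*=init        <- explicit 'init' category
  //   global-init:legacy_global      <- legacy prefix, remapped to global/init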
+ SmallVector<StringRef, 16> Lines; + SplitString(MB->getBuffer(), Lines, "\n\r"); + StringMap<StringMap<std::string> > Regexps; + assert(Entries.empty() && + "parse() should be called on an empty SpecialCaseList"); + int LineNo = 1; + for (SmallVectorImpl<StringRef>::iterator I = Lines.begin(), E = Lines.end(); + I != E; ++I, ++LineNo) { + // Ignore empty lines and lines starting with "#" + if (I->empty() || I->startswith("#")) + continue; + // Get our prefix and unparsed regexp. + std::pair<StringRef, StringRef> SplitLine = I->split(":"); + StringRef Prefix = SplitLine.first; + if (SplitLine.second.empty()) { + // Missing ':' in the line. + Error = (Twine("Malformed line ") + Twine(LineNo) + ": '" + + SplitLine.first + "'").str(); + return false; + } + + std::pair<StringRef, StringRef> SplitRegexp = SplitLine.second.split("="); + std::string Regexp = SplitRegexp.first; + StringRef Category = SplitRegexp.second; + + // Backwards compatibility. + if (Prefix == "global-init") { + Prefix = "global"; + Category = "init"; + } else if (Prefix == "global-init-type") { + Prefix = "type"; + Category = "init"; + } else if (Prefix == "global-init-src") { + Prefix = "src"; + Category = "init"; + } + + // See if we can store Regexp in Strings. + if (Regex::isLiteralERE(Regexp)) { + Entries[Prefix][Category].Strings.insert(Regexp); + continue; + } + + // Replace * with .* + for (size_t pos = 0; (pos = Regexp.find("*", pos)) != std::string::npos; + pos += strlen(".*")) { + Regexp.replace(pos, strlen("*"), ".*"); + } + + // Check that the regexp is valid. + Regex CheckRE(Regexp); + std::string REError; + if (!CheckRE.isValid(REError)) { + Error = (Twine("Malformed regex in line ") + Twine(LineNo) + ": '" + + SplitLine.second + "': " + REError).str(); + return false; + } + + // Add this regexp into the proper group by its prefix. + if (!Regexps[Prefix][Category].empty()) + Regexps[Prefix][Category] += "|"; + Regexps[Prefix][Category] += "^" + Regexp + "$"; + } + + // Iterate through each of the prefixes, and create Regexs for them. + for (StringMap<StringMap<std::string> >::const_iterator I = Regexps.begin(), + E = Regexps.end(); + I != E; ++I) { + for (StringMap<std::string>::const_iterator II = I->second.begin(), + IE = I->second.end(); + II != IE; ++II) { + Entries[I->getKey()][II->getKey()].RegEx = new Regex(II->getValue()); + } + } + return true; +} + +SpecialCaseList::~SpecialCaseList() { + for (StringMap<StringMap<Entry> >::iterator I = Entries.begin(), + E = Entries.end(); + I != E; ++I) { + for (StringMap<Entry>::const_iterator II = I->second.begin(), + IE = I->second.end(); + II != IE; ++II) { + delete II->second.RegEx; + } + } +} + +bool SpecialCaseList::isIn(const Function& F, const StringRef Category) const { + return isIn(*F.getParent(), Category) || + inSectionCategory("fun", F.getName(), Category); +} + +static StringRef GetGlobalTypeString(const GlobalValue &G) { + // Types of GlobalVariables are always pointer types. + Type *GType = G.getType()->getElementType(); + // For now we support blacklisting struct types only. 
+ if (StructType *SGType = dyn_cast<StructType>(GType)) { + if (!SGType->isLiteral()) + return SGType->getName(); + } + return "<unknown type>"; +} + +bool SpecialCaseList::isIn(const GlobalVariable &G, + const StringRef Category) const { + return isIn(*G.getParent(), Category) || + inSectionCategory("global", G.getName(), Category) || + inSectionCategory("type", GetGlobalTypeString(G), Category); +} + +bool SpecialCaseList::isIn(const GlobalAlias &GA, + const StringRef Category) const { + if (isIn(*GA.getParent(), Category)) + return true; + + if (isa<FunctionType>(GA.getType()->getElementType())) + return inSectionCategory("fun", GA.getName(), Category); + + return inSectionCategory("global", GA.getName(), Category) || + inSectionCategory("type", GetGlobalTypeString(GA), Category); +} + +bool SpecialCaseList::isIn(const Module &M, const StringRef Category) const { + return inSectionCategory("src", M.getModuleIdentifier(), Category); +} + +bool SpecialCaseList::inSectionCategory(const StringRef Section, + const StringRef Query, + const StringRef Category) const { + StringMap<StringMap<Entry> >::const_iterator I = Entries.find(Section); + if (I == Entries.end()) return false; + StringMap<Entry>::const_iterator II = I->second.find(Category); + if (II == I->second.end()) return false; + + return II->getValue().match(Query); +} + +} // namespace llvm diff --git a/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp b/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp index 544c5ee..457fc80 100644 --- a/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp +++ b/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp @@ -22,14 +22,22 @@ using namespace llvm; // Out of line method to get vtable etc for class. void ValueMapTypeRemapper::anchor() {} +void ValueMaterializer::anchor() {} Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, - ValueMapTypeRemapper *TypeMapper) { + ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { ValueToValueMapTy::iterator I = VM.find(V); // If the value already exists in the map, use it. if (I != VM.end() && I->second) return I->second; + // If we have a materializer and it can materialize a value, use that. + if (Materializer) { + if (Value *NewV = Materializer->materializeValueFor(const_cast<Value*>(V))) + return VM[V] = NewV; + } + // Global values do not need to be seeded into the VM if they // are using the identity mapping. if (isa<GlobalValue>(V) || isa<MDString>(V)) @@ -64,7 +72,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, for (unsigned i = 0, e = MD->getNumOperands(); i != e; ++i) { Value *OP = MD->getOperand(i); if (OP == 0) continue; - Value *Mapped_OP = MapValue(OP, VM, Flags, TypeMapper); + Value *Mapped_OP = MapValue(OP, VM, Flags, TypeMapper, Materializer); // Use identity map if Mapped_Op is null and we can ignore missing // entries. if (Mapped_OP == OP || @@ -79,7 +87,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, if (Op == 0) Elts.push_back(0); else { - Value *Mapped_Op = MapValue(Op, VM, Flags, TypeMapper); + Value *Mapped_Op = MapValue(Op, VM, Flags, TypeMapper, Materializer); // Use identity map if Mapped_Op is null and we can ignore missing // entries. 
if (Mapped_Op == 0 && (Flags & RF_IgnoreMissingEntries)) @@ -109,9 +117,9 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, if (BlockAddress *BA = dyn_cast<BlockAddress>(C)) { Function *F = - cast<Function>(MapValue(BA->getFunction(), VM, Flags, TypeMapper)); + cast<Function>(MapValue(BA->getFunction(), VM, Flags, TypeMapper, Materializer)); BasicBlock *BB = cast_or_null<BasicBlock>(MapValue(BA->getBasicBlock(), VM, - Flags, TypeMapper)); + Flags, TypeMapper, Materializer)); return VM[V] = BlockAddress::get(F, BB ? BB : BA->getBasicBlock()); } @@ -121,7 +129,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, Value *Mapped = 0; for (; OpNo != NumOperands; ++OpNo) { Value *Op = C->getOperand(OpNo); - Mapped = MapValue(Op, VM, Flags, TypeMapper); + Mapped = MapValue(Op, VM, Flags, TypeMapper, Materializer); if (Mapped != C) break; } @@ -149,7 +157,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, // Map the rest of the operands that aren't processed yet. for (++OpNo; OpNo != NumOperands; ++OpNo) Ops.push_back(MapValue(cast<Constant>(C->getOperand(OpNo)), VM, - Flags, TypeMapper)); + Flags, TypeMapper, Materializer)); } if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) @@ -173,10 +181,11 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, /// current values into those specified by VMap. /// void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap, - RemapFlags Flags, ValueMapTypeRemapper *TypeMapper){ + RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer){ // Remap operands. for (User::op_iterator op = I->op_begin(), E = I->op_end(); op != E; ++op) { - Value *V = MapValue(*op, VMap, Flags, TypeMapper); + Value *V = MapValue(*op, VMap, Flags, TypeMapper, Materializer); // If we aren't ignoring missing entries, assert that something happened. if (V != 0) *op = V; @@ -204,7 +213,7 @@ void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap, for (SmallVectorImpl<std::pair<unsigned, MDNode *> >::iterator MI = MDs.begin(), ME = MDs.end(); MI != ME; ++MI) { MDNode *Old = MI->second; - MDNode *New = MapValue(Old, VMap, Flags, TypeMapper); + MDNode *New = MapValue(Old, VMap, Flags, TypeMapper, Materializer); if (New != Old) I->setMetadata(MI->first, New); } diff --git a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp index 17900da..c5e1dcb 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp @@ -356,7 +356,7 @@ namespace { Instruction *J, unsigned o, bool IBeforeJ); void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, - Instruction *J, SmallVector<Value *, 3> &ReplacedOperands, + Instruction *J, SmallVectorImpl<Value *> &ReplacedOperands, bool IBeforeJ); void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, @@ -533,7 +533,7 @@ namespace { default: break; case Instruction::GetElementPtr: // We mark this instruction as zero-cost because scalar GEPs are usually - // lowered to the intruction addressing mode. At the moment we don't + // lowered to the instruction addressing mode. At the moment we don't // generate vector GEPs. 
return 0; case Instruction::Br: @@ -625,10 +625,10 @@ namespace { ConstantInt *IntOff = ConstOffSCEV->getValue(); int64_t Offset = IntOff->getSExtValue(); - Type *VTy = cast<PointerType>(IPtr->getType())->getElementType(); + Type *VTy = IPtr->getType()->getPointerElementType(); int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy); - Type *VTy2 = cast<PointerType>(JPtr->getType())->getElementType(); + Type *VTy2 = JPtr->getType()->getPointerElementType(); if (VTy != VTy2 && Offset < 0) { int64_t VTy2TSS = (int64_t) TD->getTypeStoreSize(VTy2); OffsetInElmts = Offset/VTy2TSS; @@ -1182,6 +1182,8 @@ namespace { // Look for an instruction with which to pair instruction *I... DenseSet<Value *> Users; AliasSetTracker WriteSet(*AA); + if (I->mayWriteToMemory()) WriteSet.add(I); + bool JAfterStart = IAfterStart; BasicBlock::iterator J = llvm::next(I); for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) { @@ -1403,6 +1405,8 @@ namespace { DenseSet<Value *> Users; AliasSetTracker WriteSet(*AA); + if (I->mayWriteToMemory()) WriteSet.add(I); + for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) { (void) trackUsesOfI(Users, WriteSet, I, J); @@ -1602,7 +1606,7 @@ namespace { DenseSet<ValuePair> CurrentPairs; bool CanAdd = true; - for (SmallVector<ValuePairWithDepth, 8>::iterator C2 + for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 = BestChildren.begin(), E2 = BestChildren.end(); C2 != E2; ++C2) { if (C2->first.first == C->first.first || @@ -1642,7 +1646,7 @@ namespace { if (!CanAdd) continue; // And check the queue too... - for (SmallVector<ValuePairWithDepth, 32>::iterator C2 = Q.begin(), + for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 = Q.begin(), E2 = Q.end(); C2 != E2; ++C2) { if (C2->first.first == C->first.first || C2->first.first == C->first.second || @@ -1691,7 +1695,7 @@ namespace { // to an already-selected child. Check for this here, and if a // conflict is found, then remove the previously-selected child // before adding this one in its place. - for (SmallVector<ValuePairWithDepth, 8>::iterator C2 + for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 = BestChildren.begin(); C2 != BestChildren.end();) { if (C2->first.first == C->first.first || C2->first.first == C->first.second || @@ -1706,7 +1710,7 @@ namespace { BestChildren.push_back(ValuePairWithDepth(C->first, C->second)); } - for (SmallVector<ValuePairWithDepth, 8>::iterator C + for (SmallVectorImpl<ValuePairWithDepth>::iterator C = BestChildren.begin(), E2 = BestChildren.end(); C != E2; ++C) { size_t DepthF = getDepthFactor(C->first.first); @@ -2227,11 +2231,12 @@ namespace { // The pointer value is taken to be the one with the lowest offset. 
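      // (For instance, fusing i32 loads from p and p+4: IPtr, at the lower
      // offset, becomes the base, and the bitcast built below retypes it to
      // <2 x i32>* so a single wide access can replace the pair. Illustrative
      // values.)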
Value *VPtr = IPtr; - Type *ArgTypeI = cast<PointerType>(IPtr->getType())->getElementType(); - Type *ArgTypeJ = cast<PointerType>(JPtr->getType())->getElementType(); + Type *ArgTypeI = IPtr->getType()->getPointerElementType(); + Type *ArgTypeJ = JPtr->getType()->getPointerElementType(); Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); - Type *VArgPtrType = PointerType::get(VArgType, - cast<PointerType>(IPtr->getType())->getAddressSpace()); + Type *VArgPtrType + = PointerType::get(VArgType, + IPtr->getType()->getPointerAddressSpace()); return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o), /* insert before */ I); } @@ -2240,7 +2245,7 @@ namespace { unsigned MaskOffset, unsigned NumInElem, unsigned NumInElem1, unsigned IdxOffset, std::vector<Constant*> &Mask) { - unsigned NumElem1 = cast<VectorType>(J->getType())->getNumElements(); + unsigned NumElem1 = J->getType()->getVectorNumElements(); for (unsigned v = 0; v < NumElem1; ++v) { int m = cast<ShuffleVectorInst>(J)->getMaskValue(v); if (m < 0) { @@ -2267,18 +2272,18 @@ namespace { Type *ArgTypeJ = J->getType(); Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); - unsigned NumElemI = cast<VectorType>(ArgTypeI)->getNumElements(); + unsigned NumElemI = ArgTypeI->getVectorNumElements(); // Get the total number of elements in the fused vector type. // By definition, this must equal the number of elements in // the final mask. - unsigned NumElem = cast<VectorType>(VArgType)->getNumElements(); + unsigned NumElem = VArgType->getVectorNumElements(); std::vector<Constant*> Mask(NumElem); Type *OpTypeI = I->getOperand(0)->getType(); - unsigned NumInElemI = cast<VectorType>(OpTypeI)->getNumElements(); + unsigned NumInElemI = OpTypeI->getVectorNumElements(); Type *OpTypeJ = J->getOperand(0)->getType(); - unsigned NumInElemJ = cast<VectorType>(OpTypeJ)->getNumElements(); + unsigned NumInElemJ = OpTypeJ->getVectorNumElements(); // The fused vector will be: // ----------------------------------------------------- @@ -2340,6 +2345,12 @@ namespace { return ExpandedIEChain; } + static unsigned getNumScalarElements(Type *Ty) { + if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) + return VecTy->getNumElements(); + return 1; + } + // Returns the value to be used as the specified operand of the vector // instruction that fuses I with J. Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I, @@ -2355,17 +2366,8 @@ namespace { Instruction *L = I, *H = J; Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ; - unsigned numElemL; - if (ArgTypeL->isVectorTy()) - numElemL = cast<VectorType>(ArgTypeL)->getNumElements(); - else - numElemL = 1; - - unsigned numElemH; - if (ArgTypeH->isVectorTy()) - numElemH = cast<VectorType>(ArgTypeH)->getNumElements(); - else - numElemH = 1; + unsigned numElemL = getNumScalarElements(ArgTypeL); + unsigned numElemH = getNumScalarElements(ArgTypeH); Value *LOp = L->getOperand(o); Value *HOp = H->getOperand(o); @@ -2426,11 +2428,12 @@ namespace { if (CanUseInputs) { unsigned LOpElem = - cast<VectorType>(cast<Instruction>(LOp)->getOperand(0)->getType()) - ->getNumElements(); + cast<Instruction>(LOp)->getOperand(0)->getType() + ->getVectorNumElements(); + unsigned HOpElem = - cast<VectorType>(cast<Instruction>(HOp)->getOperand(0)->getType()) - ->getNumElements(); + cast<Instruction>(HOp)->getOperand(0)->getType() + ->getVectorNumElements(); // We have one or two input vectors. We need to map each index of the // operands to the index of the original vector. 
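Worth noting across these BBVectorize hunks is the recurring one-line fix: each forward scan now seeds the alias tracker with I's own write before walking the block, so a later access that aliases I can no longer be treated as independent. The shape, with the pass's own names:

  DenseSet<Value *> Users;
  AliasSetTracker WriteSet(*AA);
  if (I->mayWriteToMemory()) WriteSet.add(I);   // the added seeding
  for (BasicBlock::iterator J = llvm::next(I); J != E; ++J)
    (void) trackUsesOfI(Users, WriteSet, I, J);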
@@ -2646,14 +2649,14 @@ namespace { getReplacementName(IBeforeJ ? I : J, true, o, 1)); } - + NHOp->insertBefore(IBeforeJ ? J : I); HOp = NHOp; } } if (ArgType->isVectorTy()) { - unsigned numElem = cast<VectorType>(VArgType)->getNumElements(); + unsigned numElem = VArgType->getVectorNumElements(); std::vector<Constant*> Mask(numElem); for (unsigned v = 0; v < numElem; ++v) { unsigned Idx = v; @@ -2687,7 +2690,7 @@ namespace { // to the vector instruction that fuses I with J. void BBVectorize::getReplacementInputsForPair(LLVMContext& Context, Instruction *I, Instruction *J, - SmallVector<Value *, 3> &ReplacedOperands, + SmallVectorImpl<Value *> &ReplacedOperands, bool IBeforeJ) { unsigned NumOperands = I->getNumOperands(); @@ -2746,16 +2749,8 @@ namespace { VectorType *VType = getVecTypeForPair(IType, JType); unsigned numElem = VType->getNumElements(); - unsigned numElemI, numElemJ; - if (IType->isVectorTy()) - numElemI = cast<VectorType>(IType)->getNumElements(); - else - numElemI = 1; - - if (JType->isVectorTy()) - numElemJ = cast<VectorType>(JType)->getNumElements(); - else - numElemJ = 1; + unsigned numElemI = getNumScalarElements(IType); + unsigned numElemJ = getNumScalarElements(JType); if (IType->isVectorTy()) { std::vector<Constant*> Mask1(numElemI), Mask2(numElemI); @@ -2804,6 +2799,8 @@ namespace { DenseSet<Value *> Users; AliasSetTracker WriteSet(*AA); + if (I->mayWriteToMemory()) WriteSet.add(I); + for (; cast<Instruction>(L) != J; ++L) (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs); @@ -2824,6 +2821,8 @@ namespace { DenseSet<Value *> Users; AliasSetTracker WriteSet(*AA); + if (I->mayWriteToMemory()) WriteSet.add(I); + for (; cast<Instruction>(L) != J;) { if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs)) { // Move this instruction @@ -2853,6 +2852,7 @@ namespace { DenseSet<Value *> Users; AliasSetTracker WriteSet(*AA); + if (I->mayWriteToMemory()) WriteSet.add(I); // Note: We cannot end the loop when we reach J because J could be moved // farther down the use chain by another instruction pairing. Also, J diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 08d3725..5e75871 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -47,13 +47,15 @@ #include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" @@ -119,11 +121,14 @@ static const unsigned TinyTripCountUnrollThreshold = 128; /// than this number of comparisons. static const unsigned RuntimeMemoryCheckThreshold = 8; -/// We use a metadata with this name to indicate that a scalar loop was -/// vectorized and that we don't need to re-vectorize it if we run into it -/// again. -static const char* -AlreadyVectorizedMDName = "llvm.vectorizer.already_vectorized"; +/// Maximum simd width. +static const unsigned MaxVectorWidth = 64; + +/// Maximum vectorization unroll count. 
+static const unsigned MaxUnrollFactor = 16; + +/// The cost of a loop that is considered 'small' by the unroller. +static const unsigned SmallLoopCost = 20; namespace { @@ -166,7 +171,9 @@ public: updateAnalysis(); } -private: + virtual ~InnerLoopVectorizer() {} + +protected: /// A small list of PHINodes. typedef SmallVector<PHINode*, 4> PhiVector; /// When we unroll loops we have multiple vector values for each scalar. @@ -174,6 +181,11 @@ private: /// originated from one scalar instruction. typedef SmallVector<Value*, 2> VectorParts; + // When we if-convert we need create edge masks. We have to cache values so + // that we don't end up with exponential recursion/IR. + typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>, + VectorParts> EdgeMaskCache; + /// Add code that checks at runtime if the accessed arrays overlap. /// Returns the comparator value or NULL if no check is needed. Instruction *addRuntimeCheck(LoopVectorizationLegality *Legal, @@ -181,7 +193,13 @@ private: /// Create an empty loop, based on the loop ranges of the old loop. void createEmptyLoop(LoopVectorizationLegality *Legal); /// Copy and widen the instructions from the old loop. - void vectorizeLoop(LoopVectorizationLegality *Legal); + virtual void vectorizeLoop(LoopVectorizationLegality *Legal); + + /// \brief The Loop exit block may have single value PHI nodes where the + /// incoming value is 'Undef'. While vectorizing we only handled real values + /// that were defined inside the loop. Here we fix the 'undef case'. + /// See PR14725. + void fixLCSSAPHIs(); /// A helper function that computes the predicate of the block BB, assuming /// that the header block of the loop is set to True. It returns the *entry* @@ -195,16 +213,23 @@ private: void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB, PhiVector *PV); + /// Vectorize a single PHINode in a block. This method handles the induction + /// variable canonicalization. It supports both VF = 1 for unrolled loops and + /// arbitrary length vectors. + void widenPHIInstruction(Instruction *PN, VectorParts &Entry, + LoopVectorizationLegality *Legal, + unsigned UF, unsigned VF, PhiVector *PV); + /// Insert the new loop to the loop hierarchy and pass manager /// and update the analysis passes. void updateAnalysis(); /// This instruction is un-vectorizable. Implement it as a sequence /// of scalars. - void scalarizeInstruction(Instruction *Instr); + virtual void scalarizeInstruction(Instruction *Instr); /// Vectorize Load and Store instructions, - void vectorizeMemoryInstruction(Instruction *Instr, + virtual void vectorizeMemoryInstruction(Instruction *Instr, LoopVectorizationLegality *Legal); /// Create a broadcast instruction. This method generates a broadcast @@ -212,12 +237,12 @@ private: /// value. If this is the induction variable then we extend it to N, N+1, ... /// this is needed because each iteration in the loop corresponds to a SIMD /// element. - Value *getBroadcastInstrs(Value *V); + virtual Value *getBroadcastInstrs(Value *V); /// This function adds 0, 1, 2 ... to each vector element, starting at zero. /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...). /// The sequence starts at StartIndex. - Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate); + virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate); /// When we go over instructions in the basic block we rely on previous /// values within the current basic block or on loop invariant values. 
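The getBroadcastInstrs/getConsecutiveVector pair documented above produces the per-lane induction values. A standalone sketch of the arithmetic for VF = 4, with plain arrays standing in for vectors and made-up values:

  #include <cstdio>

  int main() {
    int n = 10;                          // scalar induction value this iteration
    int bcast[4] = {n, n, n, n};         // broadcast to all lanes
    int step[4]  = {0, 1, 2, 3};         // StartIdx = 0, Negate = false
    for (int i = 0; i < 4; ++i)
      printf("%d ", bcast[i] + step[i]); // 10 11 12 13
    printf("\n");
    return 0;
  }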
@@ -227,7 +252,7 @@ private: VectorParts &getVectorValue(Value *V); /// Generate a shuffle sequence that will reverse the vector Vec. - Value *reverseVector(Value *Vec); + virtual Value *reverseVector(Value *Vec); /// This is a helper class that holds the vectorizer state. It maps scalar /// instructions to vector instructions. When the code is 'unrolled' then @@ -285,6 +310,8 @@ private: /// The vectorization SIMD factor to use. Each vector will have this many /// vector elements. unsigned VF; + +protected: /// The vectorization unroll factor to use. Each scalar is vectorized to this /// many different vector instructions. unsigned UF; @@ -313,10 +340,57 @@ private: PHINode *Induction; /// The induction variable of the old basic block. PHINode *OldInduction; + /// Holds the extended (to the widest induction type) start index. + Value *ExtendedIdx; /// Maps scalars to widened vectors. ValueMap WidenMap; + EdgeMaskCache MaskCache; }; +class InnerLoopUnroller : public InnerLoopVectorizer { +public: + InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, + DominatorTree *DT, DataLayout *DL, + const TargetLibraryInfo *TLI, unsigned UnrollFactor) : + InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { } + +private: + virtual void scalarizeInstruction(Instruction *Instr); + virtual void vectorizeMemoryInstruction(Instruction *Instr, + LoopVectorizationLegality *Legal); + virtual Value *getBroadcastInstrs(Value *V); + virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate); + virtual Value *reverseVector(Value *Vec); +}; + +/// \brief Look for a meaningful debug location on the instruction or it's +/// operands. +static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { + if (!I) + return I; + + DebugLoc Empty; + if (I->getDebugLoc() != Empty) + return I; + + for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) { + if (Instruction *OpInst = dyn_cast<Instruction>(*OI)) + if (OpInst->getDebugLoc() != Empty) + return OpInst; + } + + return I; +} + +/// \brief Set the debug location in the builder using the debug location in the +/// instruction. +static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { + if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) + B.SetCurrentDebugLocation(Inst->getDebugLoc()); + else + B.SetCurrentDebugLocation(DebugLoc()); +} + /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and /// to what vectorization factor. /// This class does not look at the profitability of vectorization, only the @@ -333,10 +407,10 @@ private: class LoopVectorizationLegality { public: LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL, - DominatorTree *DT, TargetTransformInfo* TTI, - AliasAnalysis *AA, TargetLibraryInfo *TLI) - : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI), - Induction(0), HasFunNoNaNAttr(false) {} + DominatorTree *DT, TargetLibraryInfo *TLI) + : TheLoop(L), SE(SE), DL(DL), DT(DT), TLI(TLI), + Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false), + MaxSafeDepDistBytes(-1U) {} /// This enum represents the kinds of reductions that we support. enum ReductionKind { @@ -372,7 +446,7 @@ public: MRK_FloatMax }; - /// This POD struct holds information about reduction variables. + /// This struct holds information about reduction variables. 
struct ReductionDescriptor { ReductionDescriptor() : StartValue(0), LoopExitInstr(0), Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {} @@ -409,8 +483,8 @@ public: MinMaxReductionKind MinMaxKind; }; - // This POD struct holds information about the memory runtime legality - // check that a group of pointers do not overlap. + /// This struct holds information about the memory runtime legality + /// check that a group of pointers do not overlap. struct RuntimePointerCheck { RuntimePointerCheck() : Need(false) {} @@ -420,10 +494,13 @@ public: Pointers.clear(); Starts.clear(); Ends.clear(); + IsWritePtr.clear(); + DependencySetId.clear(); } /// Insert a pointer and calculate the start and end SCEVs. - void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr); + void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, + unsigned DepSetId); /// This flag indicates if we need to add the runtime check. bool Need; @@ -435,9 +512,12 @@ public: SmallVector<const SCEV*, 2> Ends; /// Holds the information if this pointer is used for writing to memory. SmallVector<bool, 2> IsWritePtr; + /// Holds the id of the set of pointers that could be dependent because of a + /// shared underlying object. + SmallVector<unsigned, 2> DependencySetId; }; - /// A POD for saving information about induction variables. + /// A struct for saving information about induction variables. struct InductionInfo { InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} InductionInfo() : StartValue(0), IK(IK_NoInduction) {} @@ -455,11 +535,6 @@ public: /// induction descriptor. typedef MapVector<PHINode*, InductionInfo> InductionList; - /// Alias(Multi)Map stores the values (GEPs or underlying objects and their - /// respective Store/Load instruction(s) to calculate aliasing. - typedef MapVector<Value*, Instruction* > AliasMap; - typedef DenseMap<Value*, std::vector<Instruction*> > AliasMultiMap; - /// Returns true if it is legal to vectorize this loop. /// This does not mean that it is profitable to vectorize this /// loop, only that it is legal to do so. @@ -474,6 +549,9 @@ public: /// Returns the induction variables found in the loop. InductionList *getInductionVars() { return &Inductions; } + /// Returns the widest induction type. + Type *getWidestInductionType() { return WidestIndTy; } + /// Returns True if V is an induction variable in this loop. bool isInductionVariable(const Value *V); @@ -503,6 +581,9 @@ public: /// This function returns the identity element (or neutral element) for /// the operation K. static Constant *getReductionIdentity(ReductionKind K, Type *Tp); + + unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } + private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -523,8 +604,9 @@ private: void collectLoopUniforms(); /// Return true if all of the instructions in the block can be speculatively - /// executed. - bool blockCanBePredicated(BasicBlock *BB); + /// executed. \p SafePtrs is a list of addresses that are known to be legal + /// and we know that we can read from them without segfault. + bool blockCanBePredicated(BasicBlock *BB, SmallPtrSet<Value *, 8>& SafePtrs); /// Returns True, if 'Phi' is the kind of reduction variable for type /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. @@ -543,16 +625,6 @@ private: /// Returns the induction kind of Phi. This function may return NoInduction /// if the PHI is not an induction variable. 
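The DependencySetId field introduced above is what keeps the emitted memcheck small in practice: only pointer pairs drawn from different dependency sets are compared at runtime, and read/read pairs are skipped outright. A rough standalone model of that pair filter, with plain integer ranges standing in for the expanded start/end SCEVs:

#include <cstddef>
#include <stdint.h>
#include <vector>

struct PtrCheck {
  uint64_t Start, End;  // byte range [Start, End] touched across the loop
  bool IsWrite;         // does the loop write through this pointer?
  unsigned DepSetId;    // pointers sharing an underlying object share an id
};

// Returns true if some pair may overlap, i.e. the vector loop must be
// bypassed in favor of the scalar loop.
bool mayConflict(const std::vector<PtrCheck> &Checks) {
  for (size_t i = 0; i < Checks.size(); ++i)
    for (size_t j = i + 1; j < Checks.size(); ++j) {
      if (!Checks[i].IsWrite && !Checks[j].IsWrite)
        continue;  // two reads can never conflict
      if (Checks[i].DepSetId == Checks[j].DepSetId)
        continue;  // same dependency set: already analyzed statically
      // Interval test mirroring the two unsigned <= comparisons emitted
      // by addRuntimeCheck ("bound0" and "bound1").
      if (Checks[i].Start <= Checks[j].End &&
          Checks[j].Start <= Checks[i].End)
        return true;
    }
  return false;
}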
InductionKind isInductionVariable(PHINode *Phi); - /// Return true if can compute the address bounds of Ptr within the loop. - bool hasComputableBounds(Value *Ptr); - /// Return true if there is the chance of write reorder. - bool hasPossibleGlobalWriteReorder(Value *Object, - Instruction *Inst, - AliasMultiMap &WriteObjects, - unsigned MaxByteWidth); - /// Return the AA location for a load or a store. - AliasAnalysis::Location getLoadStoreLocation(Instruction *Inst); - /// The loop that we evaluate. Loop *TheLoop; @@ -562,10 +634,6 @@ private: DataLayout *DL; /// Dominators. DominatorTree *DT; - /// Target Info. - TargetTransformInfo *TTI; - /// Alias Analysis. - AliasAnalysis *AA; /// Target Library Info. TargetLibraryInfo *TLI; @@ -580,6 +648,8 @@ private: /// Notice that inductions don't need to start at zero and that induction /// variables can be pointers. InductionList Inductions; + /// Holds the widest induction type encountered. + Type *WidestIndTy; /// Allowed outside users. This holds the reduction /// vars which can be accessed from outside the loop. @@ -592,6 +662,8 @@ private: RuntimePointerCheck PtrRtCheck; /// Can we assume the absence of NaNs. bool HasFunNoNaNAttr; + + unsigned MaxSafeDepDistBytes; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -684,12 +756,140 @@ private: const TargetLibraryInfo *TLI; }; +/// Utility class for getting and setting loop vectorizer hints in the form +/// of loop metadata. +struct LoopVectorizeHints { + /// Vectorization width. + unsigned Width; + /// Vectorization unroll factor. + unsigned Unroll; + + LoopVectorizeHints(const Loop *L, bool DisableUnrolling) + : Width(VectorizationFactor) + , Unroll(DisableUnrolling ? 1 : VectorizationUnroll) + , LoopID(L->getLoopID()) { + getHints(L); + // The command line options override any loop metadata except for when + // width == 1 which is used to indicate the loop is already vectorized. + if (VectorizationFactor.getNumOccurrences() > 0 && Width != 1) + Width = VectorizationFactor; + if (VectorizationUnroll.getNumOccurrences() > 0) + Unroll = VectorizationUnroll; + + DEBUG(if (DisableUnrolling && Unroll == 1) + dbgs() << "LV: Unrolling disabled by the pass manager\n"); + } + + /// Return the loop vectorizer metadata prefix. + static StringRef Prefix() { return "llvm.vectorizer."; } + + MDNode *createHint(LLVMContext &Context, StringRef Name, unsigned V) { + SmallVector<Value*, 2> Vals; + Vals.push_back(MDString::get(Context, Name)); + Vals.push_back(ConstantInt::get(Type::getInt32Ty(Context), V)); + return MDNode::get(Context, Vals); + } + + /// Mark the loop L as already vectorized by setting the width to 1. + void setAlreadyVectorized(Loop *L) { + LLVMContext &Context = L->getHeader()->getContext(); + + Width = 1; + + // Create a new loop id with one more operand for the already_vectorized + // hint. If the loop already has a loop id then copy the existing operands. + SmallVector<Value*, 4> Vals(1); + if (LoopID) + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) + Vals.push_back(LoopID->getOperand(i)); + + Vals.push_back(createHint(Context, Twine(Prefix(), "width").str(), Width)); + Vals.push_back(createHint(Context, Twine(Prefix(), "unroll").str(), 1)); + + MDNode *NewLoopID = MDNode::get(Context, Vals); + // Set operand 0 to refer to the loop id itself. 
+ NewLoopID->replaceOperandWith(0, NewLoopID); + + L->setLoopID(NewLoopID); + if (LoopID) + LoopID->replaceAllUsesWith(NewLoopID); + + LoopID = NewLoopID; + } + +private: + MDNode *LoopID; + + /// Find hints specified in the loop metadata. + void getHints(const Loop *L) { + if (!LoopID) + return; + + // First operand should refer to the loop id itself. + assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); + assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); + + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + const MDString *S = 0; + SmallVector<Value*, 4> Args; + + // The expected hint is either an MDString or an MDNode with the first + // operand an MDString. + if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) { + if (!MD || MD->getNumOperands() == 0) + continue; + S = dyn_cast<MDString>(MD->getOperand(0)); + for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) + Args.push_back(MD->getOperand(i)); + } else { + S = dyn_cast<MDString>(LoopID->getOperand(i)); + assert(Args.size() == 0 && "too many arguments for MDString"); + } + + if (!S) + continue; + + // Check if the hint starts with the vectorizer prefix. + StringRef Hint = S->getString(); + if (!Hint.startswith(Prefix())) + continue; + // Remove the prefix. + Hint = Hint.substr(Prefix().size(), StringRef::npos); + + if (Args.size() == 1) + getHint(Hint, Args[0]); + } + } + + // Check string hint with one operand. + void getHint(StringRef Hint, Value *Arg) { + const ConstantInt *C = dyn_cast<ConstantInt>(Arg); + if (!C) return; + unsigned Val = C->getZExtValue(); + + if (Hint == "width") { + if (isPowerOf2_32(Val) && Val <= MaxVectorWidth) + Width = Val; + else + DEBUG(dbgs() << "LV: ignoring invalid width hint metadata\n"); + } else if (Hint == "unroll") { + if (isPowerOf2_32(Val) && Val <= MaxUnrollFactor) + Unroll = Val; + else + DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata\n"); + } else { + DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint << '\n'); + } + } +}; + /// The LoopVectorize Pass. struct LoopVectorize : public LoopPass { /// Pass identification, replacement for typeid static char ID; - explicit LoopVectorize() : LoopPass(ID) { + explicit LoopVectorize(bool NoUnrolling = false) + : LoopPass(ID), DisableUnrolling(NoUnrolling) { initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); } @@ -698,8 +898,8 @@ struct LoopVectorize : public LoopPass { LoopInfo *LI; TargetTransformInfo *TTI; DominatorTree *DT; - AliasAnalysis *AA; TargetLibraryInfo *TLI; + bool DisableUnrolling; virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { // We only vectorize innermost loops. @@ -711,19 +911,30 @@ struct LoopVectorize : public LoopPass { LI = &getAnalysis<LoopInfo>(); TTI = &getAnalysis<TargetTransformInfo>(); DT = &getAnalysis<DominatorTree>(); - AA = getAnalysisIfAvailable<AliasAnalysis>(); TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); + // If the target claims to have no vector registers, don't attempt + // vectorization.
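getHint above deliberately ignores, rather than clamps, metadata values that fail validation, so malformed IR cannot force an illegal vectorization or unroll factor. The same guard in isolation; kMaxVectorWidth is an assumed stand-in for the MaxVectorWidth constant, which is defined outside this excerpt:

#include <cstdio>
#include <cstring>

static const unsigned kMaxVectorWidth = 64;  // assumed; mirrors MaxVectorWidth
static const unsigned kMaxUnrollFactor = 16; // mirrors MaxUnrollFactor above

static bool isPow2_32(unsigned V) { return V != 0 && (V & (V - 1)) == 0; }

// Apply one parsed "llvm.vectorizer.<name>" hint. Invalid values are
// ignored, not clamped, exactly so bad metadata cannot pick a bogus VF/UF.
void applyHint(const char *Name, unsigned Val,
               unsigned &Width, unsigned &Unroll) {
  if (std::strcmp(Name, "width") == 0) {
    if (isPow2_32(Val) && Val <= kMaxVectorWidth)
      Width = Val;
    else
      std::fprintf(stderr, "ignoring invalid width hint %u\n", Val);
  } else if (std::strcmp(Name, "unroll") == 0) {
    if (isPow2_32(Val) && Val <= kMaxUnrollFactor)
      Unroll = Val;
    else
      std::fprintf(stderr, "ignoring invalid unroll hint %u\n", Val);
  }
}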
+ if (!TTI->getNumberOfRegisters(true)) + return false; + if (DL == NULL) { - DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout"); + DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout\n"); return false; } DEBUG(dbgs() << "LV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); + LoopVectorizeHints Hints(L, DisableUnrolling); + + if (Hints.Width == 1 && Hints.Unroll == 1) { + DEBUG(dbgs() << "LV: Not vectorizing.\n"); + return false; + } + // Check if it is legal to vectorize the loop. - LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA, TLI); + LoopVectorizationLegality LVL(L, SE, DL, DT, TLI); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing.\n"); return false; @@ -749,23 +960,30 @@ struct LoopVectorize : public LoopPass { // Select the optimal vectorization factor. LoopVectorizationCostModel::VectorizationFactor VF; - VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor); + VF = CM.selectVectorizationFactor(OptForSize, Hints.Width); // Select the unroll factor. - unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll, - VF.Width, VF.Cost); + unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width, + VF.Cost); + + DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<< + F->getParent()->getModuleIdentifier() << '\n'); + DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n'); if (VF.Width == 1) { DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); - return false; + if (UF == 1) + return false; + // We decided not to vectorize, but we may want to unroll. + InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF); + Unroller.vectorize(&LVL); + } else { + // If we decided that it is *legal* to vectorize the loop then do it. + InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF); + LB.vectorize(&LVL); } - DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<< - F->getParent()->getModuleIdentifier()<<"\n"); - DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n"); - - // If we decided that it is *legal* to vectorize the loop then do it. - InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF); - LB.vectorize(&LVL); + // Mark the loop as already vectorized to avoid vectorizing again. + Hints.setAlreadyVectorized(L); DEBUG(verifyFunction(*L->getHeader()->getParent())); return true; @@ -795,38 +1013,34 @@ struct LoopVectorize : public LoopPass { void LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, - bool WritePtr) { + bool WritePtr, + unsigned DepSetId) { const SCEV *Sc = SE->getSCEV(Ptr); const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); assert(AR && "Invalid addrec expression"); - const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch()); + const SCEV *Ex = SE->getBackedgeTakenCount(Lp); const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); Pointers.push_back(Ptr); Starts.push_back(AR->getStart()); Ends.push_back(ScEnd); IsWritePtr.push_back(WritePtr); + DependencySetId.push_back(DepSetId); } Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { - // Save the current insertion location. - Instruction *Loc = Builder.GetInsertPoint(); - // We need to place the broadcast of invariant variables outside the loop. 
Instruction *Instr = dyn_cast<Instruction>(V); bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody); bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr; // Place the code for broadcasting invariant variables in the new preheader. + IRBuilder<>::InsertPointGuard Guard(Builder); if (Invariant) Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); // Broadcast the scalar into all locations in the vector. Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); - // Restore the builder insertion point. - if (Invariant) - Builder.SetInsertPoint(Loc); - return Shuf; } @@ -853,10 +1067,35 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx, return Builder.CreateAdd(Val, Cv, "induction"); } +/// \brief Find the operand of the GEP that should be checked for consecutive +/// stores. This ignores trailing indices that have no effect on the final +/// pointer. +static unsigned getGEPInductionOperand(DataLayout *DL, + const GetElementPtrInst *Gep) { + unsigned LastOperand = Gep->getNumOperands() - 1; + unsigned GEPAllocSize = DL->getTypeAllocSize( + cast<PointerType>(Gep->getType()->getScalarType())->getElementType()); + + // Walk backwards and try to peel off zeros. + while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) { + // Find the type we're currently indexing into. + gep_type_iterator GEPTI = gep_type_begin(Gep); + std::advance(GEPTI, LastOperand - 1); + + // If it's a type with the same allocation size as the result of the GEP we + // can peel off the zero index. + if (DL->getTypeAllocSize(*GEPTI) != GEPAllocSize) + break; + --LastOperand; + } + + return LastOperand; +} + int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr"); // Make sure that the pointer does not point to structs. - if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType()) + if (Ptr->getType()->getPointerElementType()->isAggregateType()) return 0; // If this value is a pointer induction variable we know it is consecutive. @@ -874,8 +1113,6 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { return 0; unsigned NumOperands = Gep->getNumOperands(); - Value *LastIndex = Gep->getOperand(NumOperands - 1); - Value *GpPtr = Gep->getPointerOperand(); // If this GEP value is a consecutive pointer induction variable and all of // the indices are constant then we know it is consecutive. We can @@ -899,14 +1136,18 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { return -1; } - // Check that all of the gep indices are uniform except for the last. - for (unsigned i = 0; i < NumOperands - 1; ++i) - if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) + unsigned InductionOperand = getGEPInductionOperand(DL, Gep); + + // Check that all of the gep indices are uniform except for our induction + // operand. + for (unsigned i = 0; i != NumOperands; ++i) + if (i != InductionOperand && + !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) return 0; - // We can emit wide load/stores only if the last index is the induction - // variable. - const SCEV *Last = SE->getSCEV(LastIndex); + // We can emit wide load/stores only if the last non-zero index is the + // induction variable. 
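getGEPInductionOperand's backward walk is the subtle part of the new consecutive-pointer logic: a trailing zero index may only be peeled while the type level it steps out of has the same allocation size as the GEP's result, because only then does the zero contribute nothing to the final address. A simplified arithmetic model of that walk, with assumed per-operand inputs in place of a real GEP and DataLayout:

#include <stdint.h>
#include <vector>

// Model of one GEP: operand 0 is the base pointer; for operand i >= 1,
// IsZero[i] says whether the index is the constant 0 and IndexedSize[i]
// is the alloc size of the type that operand i indexes into.
struct GepModel {
  std::vector<bool> IsZero;
  std::vector<uint64_t> IndexedSize;
  uint64_t ResultAllocSize; // alloc size of the GEP's result element type
};

// Mirrors the shape of getGEPInductionOperand: walk the indices backwards
// and drop trailing zeros that provably do not move the pointer.
unsigned inductionOperand(const GepModel &G) {
  unsigned LastOperand = (unsigned)G.IsZero.size() - 1;
  while (LastOperand > 1 && G.IsZero[LastOperand]) {
    // Peeling is only sound when the level being indexed has the same
    // allocation size as the GEP result (e.g. a zero index into
    // [1 x i32] when the GEP produces an i32*).
    if (G.IndexedSize[LastOperand] != G.ResultAllocSize)
      break;
    --LastOperand;
  }
  return LastOperand;
}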
+ const SCEV *Last = SE->getSCEV(Gep->getOperand(InductionOperand)); if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { const SCEV *Step = AR->getStepRecurrence(*SE); @@ -964,7 +1205,11 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Type *DataTy = VectorType::get(ScalarDataTy, VF); Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); - + // An alignment of 0 means target abi alignment. We need to use the scalar's + // target abi alignment in such a case. + if (!Alignment) + Alignment = DL->getABITypeAlignment(ScalarDataTy); + unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy); unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF; @@ -985,6 +1230,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, // Handle consecutive loads/stores. GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { + setDebugLocFromInst(Builder, Gep); Value *PtrOperand = Gep->getPointerOperand(); Value *FirstBasePtr = getVectorValue(PtrOperand)[0]; FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero); @@ -995,26 +1241,40 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Gep2->setName("gep.indvar.base"); Ptr = Builder.Insert(Gep2); } else if (Gep) { + setDebugLocFromInst(Builder, Gep); assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()), OrigLoop) && "Base ptr must be invariant"); // The last index does not have to be the induction. It can be // consecutive and be a function of the index. For example A[I+1]; unsigned NumOperands = Gep->getNumOperands(); - - Value *LastGepOperand = Gep->getOperand(NumOperands - 1); - VectorParts &GEPParts = getVectorValue(LastGepOperand); - Value *LastIndex = GEPParts[0]; - LastIndex = Builder.CreateExtractElement(LastIndex, Zero); - + unsigned InductionOperand = getGEPInductionOperand(DL, Gep); // Create the new GEP with the new induction variable. GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); - Gep2->setOperand(NumOperands - 1, LastIndex); - Gep2->setName("gep.indvar.idx"); + + for (unsigned i = 0; i < NumOperands; ++i) { + Value *GepOperand = Gep->getOperand(i); + Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand); + + // Update last index or loop invariant instruction anchored in loop. + if (i == InductionOperand || + (GepOperandInst && OrigLoop->contains(GepOperandInst))) { + assert((i == InductionOperand || + SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) && + "Must be last index or loop invariant"); + + VectorParts &GEPParts = getVectorValue(GepOperand); + Value *Index = GEPParts[0]; + Index = Builder.CreateExtractElement(Index, Zero); + Gep2->setOperand(i, Index); + Gep2->setName("gep.indvar.idx"); + } + } Ptr = Builder.Insert(Gep2); } else { // Use the induction element ptr. 
assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); + setDebugLocFromInst(Builder, Ptr); VectorParts &PtrVal = getVectorValue(Ptr); Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); } @@ -1023,8 +1283,11 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, if (SI) { assert(!Legal->isUniform(SI->getPointerOperand()) && "We do not allow storing to uniform addresses"); + setDebugLocFromInst(Builder, SI); + // We don't want to update the value in the map as it might be used in + // another expression. So don't use a reference type for "StoredVal". + VectorParts StoredVal = getVectorValue(SI->getValueOperand()); - VectorParts &StoredVal = getVectorValue(SI->getValueOperand()); for (unsigned Part = 0; Part < UF; ++Part) { // Calculate the pointer for the specific unroll-part. Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); @@ -1039,11 +1302,16 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); } - Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo()); + Value *VecPtr = Builder.CreateBitCast(PartPtr, + DataTy->getPointerTo(AddressSpace)); Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment); } + return; } + // Handle loads. + assert(LI && "Must have a load instruction"); + setDebugLocFromInst(Builder, LI); for (unsigned Part = 0; Part < UF; ++Part) { // Calculate the pointer for the specific unroll-part. Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); @@ -1055,7 +1323,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); } - Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo()); + Value *VecPtr = Builder.CreateBitCast(PartPtr, + DataTy->getPointerTo(AddressSpace)); Value *LI = Builder.CreateLoad(VecPtr, "wide.load"); cast<LoadInst>(LI)->setAlignment(Alignment); Entry[Part] = Reverse ? reverseVector(LI) : LI; @@ -1067,6 +1336,8 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { // Holds vector parameters or scalars, in case of uniform vals. SmallVector<VectorParts, 4> Params; + setDebugLocFromInst(Builder, Instr); + // Find all of the vectorized parameters. for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { Value *SrcOp = Instr->getOperand(op); @@ -1112,7 +1383,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); - // Replace the operands of the cloned instrucions with extracted scalars. + // Replace the operands of the cloned instructions with extracted scalars. for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { Value *Op = Params[op][Part]; // Param is a vector. Need to extract the right lane. @@ -1142,16 +1413,13 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, if (!PtrRtCheck->Need) return NULL; - Instruction *MemoryRuntimeCheck = 0; unsigned NumPointers = PtrRtCheck->Pointers.size(); - SmallVector<Value* , 2> Starts; - SmallVector<Value* , 2> Ends; + SmallVector<TrackingVH<Value> , 2> Starts; + SmallVector<TrackingVH<Value> , 2> Ends; + LLVMContext &Ctx = Loc->getContext(); SCEVExpander Exp(*SE, "induction"); - // Use this type for pointer arithmetic. 
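The reverse-access case above offsets each part pointer by -Part * VF and then by 1 - VF, so a single contiguous wide load fetches exactly the VF elements the scalar loop would have read, only in the opposite lane order; reverseVector then restores the order. A scalar emulation of that address math (array contents and VF/UF chosen arbitrarily):

#include <cassert>

int main() {
  const int VF = 4, UF = 2;
  int A[64];
  for (int i = 0; i < 64; ++i) A[i] = i;

  // Ptr points at the element the first scalar iteration of this vector
  // body iteration would read; the scalar loop walks downwards from it.
  int *Ptr = &A[40];

  for (int Part = 0; Part < UF; ++Part) {
    // PartPtr = Ptr + (-Part * VF) + (1 - VF), as in the reverse case above.
    int *PartPtr = Ptr - Part * VF + (1 - VF);
    int Lanes[VF];
    for (int l = 0; l < VF; ++l) Lanes[l] = PartPtr[l]; // one wide load
    // After reverseVector, lane l holds the value for scalar iteration
    // Part*VF + l, i.e. *(Ptr - Part*VF - l).
    for (int l = 0; l < VF; ++l)
      assert(Lanes[VF - 1 - l] == *(Ptr - Part * VF - l));
  }
  return 0;
}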
- Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0); - for (unsigned i = 0; i < NumPointers; ++i) { Value *Ptr = PtrRtCheck->Pointers[i]; const SCEV *Sc = SE->getSCEV(Ptr); @@ -1162,7 +1430,11 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, Starts.push_back(Ptr); Ends.push_back(Ptr); } else { - DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n"); + DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n'); + unsigned AS = Ptr->getType()->getPointerAddressSpace(); + + // Use this type for pointer arithmetic. + Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS); Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc); Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); @@ -1172,17 +1444,32 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, } IRBuilder<> ChkBuilder(Loc); - + // Our instructions might fold to a constant. + Value *MemoryRuntimeCheck = 0; for (unsigned i = 0; i < NumPointers; ++i) { for (unsigned j = i+1; j < NumPointers; ++j) { // No need to check if two readonly pointers intersect. if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j]) continue; - Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc"); - Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc"); - Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc"); - Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy, "bc"); + // Only need to check pointers between two different dependency sets. + if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j]) + continue; + + unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace(); + unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace(); + + assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) && + (AS1 == Ends[i]->getType()->getPointerAddressSpace()) && + "Trying to bounds check pointers with different address spaces"); + + Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0); + Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1); + + Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc"); + Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc"); + Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy1, "bc"); + Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy0, "bc"); Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); @@ -1190,12 +1477,17 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, if (MemoryRuntimeCheck) IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx"); - - MemoryRuntimeCheck = cast<Instruction>(IsConflict); + MemoryRuntimeCheck = IsConflict; } } - return MemoryRuntimeCheck; + // We have to do this trickery because the IRBuilder might fold the check to a + // constant expression in which case there is no Instruction anchored in + // the block. + Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck, + ConstantInt::getTrue(Ctx)); + ChkBuilder.Insert(Check, "memcheck.conflict"); + return Check; } void @@ -1234,23 +1526,27 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { BasicBlock *ExitBlock = OrigLoop->getExitBlock(); assert(ExitBlock && "Must have an exit block"); - // Mark the old scalar loop with metadata that tells us not to vectorize this - // loop again if we run into it.
- MDNode *MD = MDNode::get(OldBasicBlock->getContext(), None); - OldBasicBlock->getTerminator()->setMetadata(AlreadyVectorizedMDName, MD); - // Some loops have a single integer induction variable, while other loops // don't. One example is c++ iterators that often have multiple pointer // induction variables. In the code below we also support a case where we // don't have a single induction variable. OldInduction = Legal->getInduction(); - Type *IdxTy = OldInduction ? OldInduction->getType() : - DL->getIntPtrType(SE->getContext()); + Type *IdxTy = Legal->getWidestInductionType(); // Find the loop boundaries. - const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch()); + const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop); assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); + // The exit count might have the type of i64 while the phi is i32. This can + // happen if we have an induction variable that is sign extended before the + // compare. The only way that we get a backedge taken count is that the + // induction variable was signed and as such will not overflow. In such a case + // truncation is legal. + if (ExitCount->getType()->getPrimitiveSizeInBits() > + IdxTy->getPrimitiveSizeInBits()) + ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy); + + ExitCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy); // Get the total trip count from the count by adding 1. ExitCount = SE->getAddExpr(ExitCount, SE->getConstant(ExitCount->getType(), 1)); @@ -1266,9 +1562,11 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // The loop index does not have to start at Zero. Find the original start // value from the induction PHI node. If we don't have an induction variable // then we know that it starts at zero. - Value *StartIdx = OldInduction ? - OldInduction->getIncomingValueForBlock(BypassBlock): - ConstantInt::get(IdxTy, 0); + Builder.SetInsertPoint(BypassBlock->getTerminator()); + Value *StartIdx = ExtendedIdx = OldInduction ? + Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock), + IdxTy): + ConstantInt::get(IdxTy, 0); assert(BypassBlock && "Invalid loop structure"); LoopBypassBlocks.push_back(BypassBlock); @@ -1283,11 +1581,28 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { BasicBlock *ScalarPH = MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); + // Create and register the new vector loop. + Loop* Lp = new Loop(); + Loop *ParentLoop = OrigLoop->getParentLoop(); + + // Insert the new loop into the loop nest and register the new basic blocks + // before calling any utilities such as SCEV that require valid LoopInfo. + if (ParentLoop) { + ParentLoop->addChildLoop(Lp); + ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); + ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); + ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); + } else { + LI->addTopLevelLoop(Lp); + } + Lp->addBasicBlockToLoop(VecBody, LI->getBase()); + // Use this IR builder to create the loop instructions (Phi, Br, Cmp) // inside the loop. - Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); + Builder.SetInsertPoint(VecBody->getFirstNonPHI()); // Generate the induction variable. + setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); Induction = Builder.CreatePHI(IdxTy, 2, "index"); // The loop step is equal to the vectorization factor (num of SIMD elements) // times the unroll factor (num of SIMD instructions). 
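Tying the pieces above together: the trip count is the backedge-taken count plus one, computed in the widest induction type, and the vector body only executes the portion that rounds down to a multiple of VF * UF; the scalar remainder loop picks up the rest. The arithmetic in isolation, with plain integers standing in for the SCEV expressions:

#include <cassert>
#include <stdint.h>

int main() {
  const uint64_t VF = 4, UF = 2;

  // Backedge-taken count: the number of times the latch branches back, so
  // the trip count is BTC + 1 (the SE->getAddExpr(ExitCount, 1) above).
  uint64_t BTC = 102;
  uint64_t Count = BTC + 1;

  // Round the count down to a multiple of the widened step VF * UF; the
  // vector body covers CountRoundDown scalar iterations and the scalar
  // loop runs the remainder.
  uint64_t Step = VF * UF;
  uint64_t CountRoundDown = Count - Count % Step;

  assert(CountRoundDown == 96 && Count - CountRoundDown == 7);
  return 0;
}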
@@ -1296,6 +1611,8 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // This is the IR builder that we use to add all of the logic for bypassing // the new vector loop. IRBuilder<> BypassBuilder(BypassBlock->getTerminator()); + setDebugLocFromInst(BypassBuilder, + getDebugLocFromInstOrOperands(OldInduction)); // We may need to extend the index in case there is a type mismatch. // We know that the count starts at zero and does not overflow. @@ -1334,6 +1651,8 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // Create a new block containing the memory check. BasicBlock *CheckBlock = BypassBlock->splitBasicBlock(MemRuntimeCheck, "vector.memcheck"); + if (ParentLoop) + ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); LoopBypassBlocks.push_back(CheckBlock); // Replace the branch into the memory check block with a conditional branch @@ -1362,76 +1681,101 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { PHINode *ResumeIndex = 0; LoopVectorizationLegality::InductionList::iterator I, E; LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); + // Set builder to point to last bypass block. + BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator()); for (I = List->begin(), E = List->end(); I != E; ++I) { PHINode *OrigPhi = I->first; LoopVectorizationLegality::InductionInfo II = I->second; - PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val", + + Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType(); + PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val", MiddleBlock->getTerminator()); + // We might have extended the type of the induction variable but we need a + // truncated version for the scalar loop. + PHINode *TruncResumeVal = (OrigPhi == OldInduction) ? + PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val", + MiddleBlock->getTerminator()) : 0; + Value *EndValue = 0; switch (II.IK) { case LoopVectorizationLegality::IK_NoInduction: llvm_unreachable("Unknown induction"); case LoopVectorizationLegality::IK_IntInduction: { - // Handle the integer induction counter: + // Handle the integer induction counter. assert(OrigPhi->getType()->isIntegerTy() && "Invalid type"); - assert(OrigPhi == OldInduction && "Unknown integer PHI"); - // We know what the end value is. - EndValue = IdxEndRoundDown; - // We also know which PHI node holds it. - ResumeIndex = ResumeVal; + + // We have the canonical induction variable. + if (OrigPhi == OldInduction) { + // Create a truncated version of the resume value for the scalar loop; + // we might have promoted the type to a larger width. + EndValue = + BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType()); + // The new PHI merges the original incoming value, in case of a bypass, + // or the value at the end of the vectorized loop. + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); + TruncResumeVal->addIncoming(EndValue, VecBody); + + // We know what the end value is. + EndValue = IdxEndRoundDown; + // We also know which PHI node holds it. + ResumeIndex = ResumeVal; + break; + } + + // Not the canonical induction variable - add the vector loop count to the + // start value.
+ Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, + II.StartValue->getType(), + "cast.crd"); + EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue, "ind.end"); break; } case LoopVectorizationLegality::IK_ReverseIntInduction: { // Convert the CountRoundDown variable to the PHI size. - unsigned CRDSize = CountRoundDown->getType()->getScalarSizeInBits(); - unsigned IISize = II.StartValue->getType()->getScalarSizeInBits(); - Value *CRD = CountRoundDown; - if (CRDSize > IISize) - CRD = CastInst::Create(Instruction::Trunc, CountRoundDown, - II.StartValue->getType(), "tr.crd", - LoopBypassBlocks.back()->getTerminator()); - else if (CRDSize < IISize) - CRD = CastInst::Create(Instruction::SExt, CountRoundDown, - II.StartValue->getType(), - "sext.crd", - LoopBypassBlocks.back()->getTerminator()); - // Handle reverse integer induction counter: - EndValue = - BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end", - LoopBypassBlocks.back()->getTerminator()); + Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, + II.StartValue->getType(), + "cast.crd"); + // Handle reverse integer induction counter. + EndValue = BypassBuilder.CreateSub(II.StartValue, CRD, "rev.ind.end"); break; } case LoopVectorizationLegality::IK_PtrInduction: { // For pointer induction variables, calculate the offset using // the end index. - EndValue = - GetElementPtrInst::Create(II.StartValue, CountRoundDown, "ptr.ind.end", - LoopBypassBlocks.back()->getTerminator()); + EndValue = BypassBuilder.CreateGEP(II.StartValue, CountRoundDown, + "ptr.ind.end"); break; } case LoopVectorizationLegality::IK_ReversePtrInduction: { // The value at the end of the loop for the reverse pointer is calculated // by creating a GEP with a negative index starting from the start value. Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0); - Value *NegIdx = BinaryOperator::CreateSub(Zero, CountRoundDown, - "rev.ind.end", - LoopBypassBlocks.back()->getTerminator()); - EndValue = GetElementPtrInst::Create(II.StartValue, NegIdx, - "rev.ptr.ind.end", - LoopBypassBlocks.back()->getTerminator()); + Value *NegIdx = BypassBuilder.CreateSub(Zero, CountRoundDown, + "rev.ind.end"); + EndValue = BypassBuilder.CreateGEP(II.StartValue, NegIdx, + "rev.ptr.ind.end"); break; } }// end of case // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) - ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) { + if (OrigPhi == OldInduction) + ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]); + else + ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); + } ResumeVal->addIncoming(EndValue, VecBody); // Fix the scalar body counter (PHI node). unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); + // The old induction's phi node in the scalar body needs the truncated value. + if (OrigPhi == OldInduction) + OrigPhi->setIncomingValue(BlockIdx, TruncResumeVal); + else + OrigPhi->setIncomingValue(BlockIdx, ResumeVal); } // If we are generating a new induction variable then we also need to @@ -1476,24 +1820,6 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // Get ready to start creating new instructions into the vectorized body. Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); - // Create and register the new vector loop.
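All four end-value computations above reduce to simple formulas over CountRoundDown: start plus n, start minus n, and the forward and backward GEP equivalents for pointers. A compact sketch with plain types (n stands in for CountRoundDown; the buffer merely keeps the pointer arithmetic in bounds):

#include <cassert>
#include <stdint.h>

int main() {
  int64_t Start = 10;
  int64_t n = 96;            // CountRoundDown: scalar iterations covered
  static int Buf[256];
  int *PStart = &Buf[100];

  int64_t IntEnd = Start + n;        // IK_IntInduction ("ind.end")
  int64_t RevIntEnd = Start - n;     // IK_ReverseIntInduction ("rev.ind.end")
  int *PtrEnd = PStart + n;          // IK_PtrInduction ("ptr.ind.end")
  int *RevPtrEnd = PStart + (0 - n); // IK_ReversePtrInduction ("rev.ptr.ind.end")

  assert(IntEnd == 106 && RevIntEnd == -86);
  assert(PtrEnd == &Buf[196] && RevPtrEnd == &Buf[4]);
  return 0;
}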
- Loop* Lp = new Loop(); - Loop *ParentLoop = OrigLoop->getParentLoop(); - - // Insert the new loop into the loop nest and register the new basic blocks. - if (ParentLoop) { - ParentLoop->addChildLoop(Lp); - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) - ParentLoop->addBasicBlockToLoop(LoopBypassBlocks[I], LI->getBase()); - ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); - ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); - ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); - } else { - LI->addTopLevelLoop(Lp); - } - - Lp->addBasicBlockToLoop(VecBody, LI->getBase()); - // Save the state. LoopVectorPreHeader = VectorPH; LoopScalarPreHeader = ScalarPH; @@ -1501,6 +1827,9 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { LoopExitBlock = ExitBlock; LoopVectorBody = VecBody; LoopScalarBody = OldBasicBlock; + + LoopVectorizeHints Hints(Lp, true); + Hints.setAlreadyVectorized(Lp); } /// This function returns the identity element (or neutral element) for @@ -1530,6 +1859,31 @@ LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) { } } +static Intrinsic::ID checkUnaryFloatSignature(const CallInst &I, + Intrinsic::ID ValidIntrinsicID) { + if (I.getNumArgOperands() != 1 || + !I.getArgOperand(0)->getType()->isFloatingPointTy() || + I.getType() != I.getArgOperand(0)->getType() || + !I.onlyReadsMemory()) + return Intrinsic::not_intrinsic; + + return ValidIntrinsicID; +} + +static Intrinsic::ID checkBinaryFloatSignature(const CallInst &I, + Intrinsic::ID ValidIntrinsicID) { + if (I.getNumArgOperands() != 2 || + !I.getArgOperand(0)->getType()->isFloatingPointTy() || + !I.getArgOperand(1)->getType()->isFloatingPointTy() || + I.getType() != I.getArgOperand(0)->getType() || + I.getType() != I.getArgOperand(1)->getType() || + !I.onlyReadsMemory()) + return Intrinsic::not_intrinsic; + + return ValidIntrinsicID; +} + + static Intrinsic::ID getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) { // If we have an intrinsic call, check if it is trivially vectorizable. @@ -1544,14 +1898,18 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) { case Intrinsic::log10: case Intrinsic::log2: case Intrinsic::fabs: + case Intrinsic::copysign: case Intrinsic::floor: case Intrinsic::ceil: case Intrinsic::trunc: case Intrinsic::rint: case Intrinsic::nearbyint: + case Intrinsic::round: case Intrinsic::pow: case Intrinsic::fma: case Intrinsic::fmuladd: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: return II->getIntrinsicID(); default: return Intrinsic::not_intrinsic; @@ -1564,8 +1922,9 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) { LibFunc::Func Func; Function *F = CI->getCalledFunction(); // We're going to make assumptions on the semantics of the functions, check - // that the target knows that it's available in this environment. - if (!F || !TLI->getLibFunc(F->getName(), Func)) + // that the target knows that it's available in this environment and it does + // not have local linkage. 
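checkUnaryFloatSignature and checkBinaryFloatSignature exist because nothing stops a program from declaring, say, its own sinf with an unrelated signature; a libcall may only be mapped to an intrinsic when arity, floating-point argument and return types, and readonly behavior all line up. A minimal stand-alone version of the same shape test, with a hand-rolled descriptor instead of a CallInst:

#include <vector>

enum Ty { T_Float, T_Double, T_Int, T_Ptr };

struct CallShape {
  std::vector<Ty> Args;
  Ty Ret;
  bool OnlyReadsMemory;
};

static bool isFP(Ty T) { return T == T_Float || T == T_Double; }

// Unary case: one FP argument whose type equals the return type, readonly.
bool hasUnaryFloatShape(const CallShape &C) {
  return C.Args.size() == 1 && isFP(C.Args[0]) && C.Ret == C.Args[0] &&
         C.OnlyReadsMemory;
}

// Binary case: both FP arguments must match each other and the return type.
bool hasBinaryFloatShape(const CallShape &C) {
  return C.Args.size() == 2 && isFP(C.Args[0]) && isFP(C.Args[1]) &&
         C.Ret == C.Args[0] && C.Ret == C.Args[1] && C.OnlyReadsMemory;
}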
+ if (!F || F->hasLocalLinkage() || !TLI->getLibFunc(F->getName(), Func)) return Intrinsic::not_intrinsic; // Otherwise check if we have a call to a function that can be turned into a @@ -1576,59 +1935,67 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) { case LibFunc::sin: case LibFunc::sinf: case LibFunc::sinl: - return Intrinsic::sin; + return checkUnaryFloatSignature(*CI, Intrinsic::sin); case LibFunc::cos: case LibFunc::cosf: case LibFunc::cosl: - return Intrinsic::cos; + return checkUnaryFloatSignature(*CI, Intrinsic::cos); case LibFunc::exp: case LibFunc::expf: case LibFunc::expl: - return Intrinsic::exp; + return checkUnaryFloatSignature(*CI, Intrinsic::exp); case LibFunc::exp2: case LibFunc::exp2f: case LibFunc::exp2l: - return Intrinsic::exp2; + return checkUnaryFloatSignature(*CI, Intrinsic::exp2); case LibFunc::log: case LibFunc::logf: case LibFunc::logl: - return Intrinsic::log; + return checkUnaryFloatSignature(*CI, Intrinsic::log); case LibFunc::log10: case LibFunc::log10f: case LibFunc::log10l: - return Intrinsic::log10; + return checkUnaryFloatSignature(*CI, Intrinsic::log10); case LibFunc::log2: case LibFunc::log2f: case LibFunc::log2l: - return Intrinsic::log2; + return checkUnaryFloatSignature(*CI, Intrinsic::log2); case LibFunc::fabs: case LibFunc::fabsf: case LibFunc::fabsl: - return Intrinsic::fabs; + return checkUnaryFloatSignature(*CI, Intrinsic::fabs); + case LibFunc::copysign: + case LibFunc::copysignf: + case LibFunc::copysignl: + return checkBinaryFloatSignature(*CI, Intrinsic::copysign); case LibFunc::floor: case LibFunc::floorf: case LibFunc::floorl: - return Intrinsic::floor; + return checkUnaryFloatSignature(*CI, Intrinsic::floor); case LibFunc::ceil: case LibFunc::ceilf: case LibFunc::ceill: - return Intrinsic::ceil; + return checkUnaryFloatSignature(*CI, Intrinsic::ceil); case LibFunc::trunc: case LibFunc::truncf: case LibFunc::truncl: - return Intrinsic::trunc; + return checkUnaryFloatSignature(*CI, Intrinsic::trunc); case LibFunc::rint: case LibFunc::rintf: case LibFunc::rintl: - return Intrinsic::rint; + return checkUnaryFloatSignature(*CI, Intrinsic::rint); case LibFunc::nearbyint: case LibFunc::nearbyintf: case LibFunc::nearbyintl: - return Intrinsic::nearbyint; + return checkUnaryFloatSignature(*CI, Intrinsic::nearbyint); + case LibFunc::round: + case LibFunc::roundf: + case LibFunc::roundl: + return checkUnaryFloatSignature(*CI, Intrinsic::round); case LibFunc::pow: case LibFunc::powf: case LibFunc::powl: - return Intrinsic::pow; + return checkBinaryFloatSignature(*CI, Intrinsic::pow); } return Intrinsic::not_intrinsic; @@ -1690,7 +2057,8 @@ Value *createMinMaxOp(IRBuilder<> &Builder, } Value *Cmp; - if (RK == LoopVectorizationLegality::MRK_FloatMin || RK == LoopVectorizationLegality::MRK_FloatMax) + if (RK == LoopVectorizationLegality::MRK_FloatMin || + RK == LoopVectorizationLegality::MRK_FloatMax) Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp"); else Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp"); @@ -1699,6 +2067,54 @@ Value *createMinMaxOp(IRBuilder<> &Builder, return Select; } +namespace { +struct CSEDenseMapInfo { + static bool canHandle(Instruction *I) { + return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || + isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); + } + static inline Instruction *getEmptyKey() { + return DenseMapInfo<Instruction *>::getEmptyKey(); + } + static inline Instruction *getTombstoneKey() { + return DenseMapInfo<Instruction *>::getTombstoneKey(); + } + static 
unsigned getHashValue(Instruction *I) { + assert(canHandle(I) && "Unknown instruction!"); + return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), + I->value_op_end())); + } + static bool isEqual(Instruction *LHS, Instruction *RHS) { + if (LHS == getEmptyKey() || RHS == getEmptyKey() || + LHS == getTombstoneKey() || RHS == getTombstoneKey()) + return LHS == RHS; + return LHS->isIdenticalTo(RHS); + } +}; +} + +///\brief Perform CSE of induction variable instructions. +static void cse(BasicBlock *BB) { + // Perform simple CSE. + SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { + Instruction *In = I++; + + if (!CSEDenseMapInfo::canHandle(In)) + continue; + + // Check if we can replace this instruction with any of the + // visited instructions. + if (Instruction *V = CSEMap.lookup(In)) { + In->replaceAllUsesWith(V); + In->eraseFromParent(); + continue; + } + + CSEMap[In] = In; + } +} + void InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { //===------------------------------------------------===// @@ -1750,6 +2166,8 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { LoopVectorizationLegality::ReductionDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi]; + setDebugLocFromInst(Builder, RdxDesc.StartValue); + // We need to generate a reduction vector from the incoming scalar. // To do so, we need to generate the 'identity' vector and override // one of the elements with the incoming scalar reduction. We need @@ -1767,18 +2185,31 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax || RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) { // MinMax reductions have the start value as their identity. - VectorStart = Identity = Builder.CreateVectorSplat(VF, RdxDesc.StartValue, - "minmax.ident"); + if (VF == 1) { + VectorStart = Identity = RdxDesc.StartValue; + } else { + VectorStart = Identity = Builder.CreateVectorSplat(VF, + RdxDesc.StartValue, + "minmax.ident"); + } } else { + // Handle other reduction kinds: Constant *Iden = - LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind, - VecTy->getScalarType()); - Identity = ConstantVector::getSplat(VF, Iden); - - // This vector is the Identity vector where the first element is the - // incoming scalar reduction. - VectorStart = Builder.CreateInsertElement(Identity, - RdxDesc.StartValue, Zero); + LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind, + VecTy->getScalarType()); + if (VF == 1) { + Identity = Iden; + // This vector is the Identity vector where the first element is the + // incoming scalar reduction. + VectorStart = RdxDesc.StartValue; + } else { + Identity = ConstantVector::getSplat(VF, Iden); + + // This vector is the Identity vector where the first element is the + // incoming scalar reduction. + VectorStart = Builder.CreateInsertElement(Identity, + RdxDesc.StartValue, Zero); + } } // Fix the vector-loop phi. @@ -1793,7 +2224,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch); VectorParts &Val = getVectorValue(LoopVal); for (unsigned part = 0; part < UF; ++part) { - // Make sure to add the reduction stat value only to the + // Make sure to add the reduction start value only to the // first unroll part. Value *StartVal = (part == 0) ?
VectorStart : Identity; cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader); @@ -1807,6 +2238,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); VectorParts RdxParts; + setDebugLocFromInst(Builder, RdxDesc.LoopExitInstr); for (unsigned part = 0; part < UF; ++part) { // This PHINode contains the vectorized reduction variable, or // the initial value vector, if we bypass the vector loop. @@ -1822,6 +2254,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Reduce all of the unrolled parts into a single vector. Value *ReducedPartRdx = RdxParts[0]; unsigned Op = getReductionBinOp(RdxDesc.Kind); + setDebugLocFromInst(Builder, ReducedPartRdx); for (unsigned part = 1; part < UF; ++part) { if (Op != Instruction::ICmp && Op != Instruction::FCmp) ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op, @@ -1832,37 +2265,40 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { ReducedPartRdx, RdxParts[part]); } - // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles - // and vector ops, reducing the set of values being computed by half each - // round. - assert(isPowerOf2_32(VF) && - "Reduction emission only supported for pow2 vectors!"); - Value *TmpVec = ReducedPartRdx; - SmallVector<Constant*, 32> ShuffleMask(VF, 0); - for (unsigned i = VF; i != 1; i >>= 1) { - // Move the upper half of the vector to the lower half. - for (unsigned j = 0; j != i/2; ++j) - ShuffleMask[j] = Builder.getInt32(i/2 + j); - - // Fill the rest of the mask with undef. - std::fill(&ShuffleMask[i/2], ShuffleMask.end(), - UndefValue::get(Builder.getInt32Ty())); - - Value *Shuf = + if (VF > 1) { + // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles + // and vector ops, reducing the set of values being computed by half each + // round. + assert(isPowerOf2_32(VF) && + "Reduction emission only supported for pow2 vectors!"); + Value *TmpVec = ReducedPartRdx; + SmallVector<Constant*, 32> ShuffleMask(VF, 0); + for (unsigned i = VF; i != 1; i >>= 1) { + // Move the upper half of the vector to the lower half. + for (unsigned j = 0; j != i/2; ++j) + ShuffleMask[j] = Builder.getInt32(i/2 + j); + + // Fill the rest of the mask with undef. + std::fill(&ShuffleMask[i/2], ShuffleMask.end(), + UndefValue::get(Builder.getInt32Ty())); + + Value *Shuf = Builder.CreateShuffleVector(TmpVec, UndefValue::get(TmpVec->getType()), ConstantVector::get(ShuffleMask), "rdx.shuf"); - if (Op != Instruction::ICmp && Op != Instruction::FCmp) - TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf, - "bin.rdx"); - else - TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf); - } + if (Op != Instruction::ICmp && Op != Instruction::FCmp) + TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf, + "bin.rdx"); + else + TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf); + } - // The result is in the first element of the vector. - Value *Scalar0 = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); + // The result is in the first element of the vector. + ReducedPartRdx = Builder.CreateExtractElement(TmpVec, + Builder.getInt32(0)); + } // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. 
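The shuffle loop above reduces a VF-wide vector in log2(VF) rounds: each round moves the upper half of the live lanes down and combines it element-wise with the lower half, leaving the result in lane 0. An emulation over a plain array, using addition as the reduction operation (VF must be a power of two, as the assert insists):

#include <cassert>

int main() {
  const int VF = 8;
  int V[VF] = {1, 2, 3, 4, 5, 6, 7, 8}; // sum is 36

  for (int i = VF; i != 1; i >>= 1) {
    // "Shuffle" the upper half down and combine ("bin.rdx").
    for (int j = 0; j != i / 2; ++j)
      V[j] = V[j] + V[i / 2 + j];
    // Lanes >= i/2 become don't-care (undef in the IR version).
  }

  // The result is in the first element of the vector.
  assert(V[0] == 36);
  return 0;
}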
@@ -1871,7 +2307,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { for (BasicBlock::iterator LEI = LoopExitBlock->begin(), LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); - if (!LCSSAPhi) continue; + if (!LCSSAPhi) break; // All PHINodes need to have a single entry edge, or two if // we already fixed them. @@ -1881,7 +2317,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // incoming bypass edge. if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { // Add an edge coming from the bypass. - LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock); + LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); break; } }// end of the LCSSA phi scan. @@ -1893,29 +2329,38 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); // Pick the other block. int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); - (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); + (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, ReducedPartRdx); (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); }// end of for each redux variable. - // The Loop exit block may have single value PHI nodes where the incoming - // value is 'undef'. While vectorizing we only handled real values that - // were defined inside the loop. Here we handle the 'undef case'. - // See PR14725. + fixLCSSAPHIs(); + + // Remove redundant induction instructions. + cse(LoopVectorBody); +} + +void InnerLoopVectorizer::fixLCSSAPHIs() { for (BasicBlock::iterator LEI = LoopExitBlock->begin(), LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); - if (!LCSSAPhi) continue; + if (!LCSSAPhi) break; if (LCSSAPhi->getNumIncomingValues() == 1) LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()), LoopMiddleBlock); } -} +} InnerLoopVectorizer::VectorParts InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) && "Invalid edge"); + // Look for cached value. + std::pair<BasicBlock*, BasicBlock*> Edge(Src, Dst); + EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge); + if (ECEntryIt != MaskCache.end()) + return ECEntryIt->second; + VectorParts SrcMask = createBlockInMask(Src); // The terminator has to be a branch inst! @@ -1931,9 +2376,12 @@ InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { for (unsigned part = 0; part < UF; ++part) EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]); + + MaskCache[Edge] = EdgeMask; return EdgeMask; } + MaskCache[Edge] = SrcMask; return SrcMask; } @@ -1961,154 +2409,185 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { return BlockMask; } -void -InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, - BasicBlock *BB, PhiVector *PV) { - // For each instruction in the old loop. - for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - VectorParts &Entry = WidenMap.get(it); - switch (it->getOpcode()) { - case Instruction::Br: - // Nothing to do for PHIs and BR, since we already took care of the - // loop control flow instructions. - continue; - case Instruction::PHI:{ - PHINode* P = cast<PHINode>(it); - // Handle reduction variables: - if (Legal->getReductionVars()->count(P)) { - for (unsigned part = 0; part < UF; ++part) { - // This is phase one of vectorizing PHIs. 
- Type *VecTy = VectorType::get(it->getType(), VF); - Entry[part] = PHINode::Create(VecTy, 2, "vec.phi", - LoopVectorBody-> getFirstInsertionPt()); - } - PV->push_back(P); - continue; - } +void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, + InnerLoopVectorizer::VectorParts &Entry, + LoopVectorizationLegality *Legal, + unsigned UF, unsigned VF, PhiVector *PV) { + PHINode* P = cast<PHINode>(PN); + // Handle reduction variables: + if (Legal->getReductionVars()->count(P)) { + for (unsigned part = 0; part < UF; ++part) { + // This is phase one of vectorizing PHIs. + Type *VecTy = (VF == 1) ? PN->getType() : + VectorType::get(PN->getType(), VF); + Entry[part] = PHINode::Create(VecTy, 2, "vec.phi", + LoopVectorBody-> getFirstInsertionPt()); + } + PV->push_back(P); + return; + } - // Check for PHI nodes that are lowered to vector selects. - if (P->getParent() != OrigLoop->getHeader()) { - // We know that all PHIs in non header blocks are converted into - // selects, so we don't have to worry about the insertion order and we - // can just use the builder. - // At this point we generate the predication tree. There may be - // duplications since this is a simple recursive scan, but future - // optimizations will clean it up. - - unsigned NumIncoming = P->getNumIncomingValues(); - assert(NumIncoming > 1 && "Invalid PHI"); - - // Generate a sequence of selects of the form: - // SELECT(Mask3, In3, - // SELECT(Mask2, In2, - // ( ...))) - for (unsigned In = 0; In < NumIncoming; In++) { - VectorParts Cond = createEdgeMask(P->getIncomingBlock(In), - P->getParent()); - VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); - - for (unsigned part = 0; part < UF; ++part) { - // We don't need to 'select' the first PHI operand because it is - // the default value if all of the other masks don't match. - if (In == 0) - Entry[part] = In0[part]; - else - // Select between the current value and the previous incoming edge - // based on the incoming mask. - Entry[part] = Builder.CreateSelect(Cond[part], In0[part], - Entry[part], "predphi"); - } - } - continue; + setDebugLocFromInst(Builder, P); + // Check for PHI nodes that are lowered to vector selects. + if (P->getParent() != OrigLoop->getHeader()) { + // We know that all PHIs in non header blocks are converted into + // selects, so we don't have to worry about the insertion order and we + // can just use the builder. + // At this point we generate the predication tree. There may be + // duplications since this is a simple recursive scan, but future + // optimizations will clean it up. + + unsigned NumIncoming = P->getNumIncomingValues(); + + // Generate a sequence of selects of the form: + // SELECT(Mask3, In3, + // SELECT(Mask2, In2, + // ( ...))) + for (unsigned In = 0; In < NumIncoming; In++) { + VectorParts Cond = createEdgeMask(P->getIncomingBlock(In), + P->getParent()); + VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); + + for (unsigned part = 0; part < UF; ++part) { + // We might have single edge PHIs (blocks) - use an identity + // 'select' for the first PHI operand. + if (In == 0) + Entry[part] = Builder.CreateSelect(Cond[part], In0[part], + In0[part]); + else + // Select between the current value and the previous incoming edge + // based on the incoming mask. + Entry[part] = Builder.CreateSelect(Cond[part], In0[part], + Entry[part], "predphi"); } + } + return; + } - // This PHINode must be an induction variable. - // Make sure that we know about it. 
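The select chain built above can be read lane-wise: the first incoming value seeds the result and each later edge conditionally overwrites it under that edge's mask, so with well-formed masks exactly one incoming value survives. A one-lane emulation with bools standing in for the mask vectors:

#include <cassert>

// One lane of the predicated-PHI lowering: later edges override earlier
// ones exactly when their edge mask is true.
int predPhiLane(const bool Mask[], const int In[], int NumIncoming) {
  // Identity select for the first operand (mirrors the In == 0 case above).
  int R = Mask[0] ? In[0] : In[0];
  for (int i = 1; i < NumIncoming; ++i)
    R = Mask[i] ? In[i] : R; // SELECT(Mask_i, In_i, R)
  return R;
}

int main() {
  bool M[3] = {false, true, false};
  int In[3] = {10, 20, 30};
  // Edge 1 is the taken edge, so its value wins.
  assert(predPhiLane(M, In, 3) == 20);
  return 0;
}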
- assert(Legal->getInductionVars()->count(P) && - "Not an induction variable"); + // This PHINode must be an induction variable. + // Make sure that we know about it. + assert(Legal->getInductionVars()->count(P) && + "Not an induction variable"); - LoopVectorizationLegality::InductionInfo II = - Legal->getInductionVars()->lookup(P); + LoopVectorizationLegality::InductionInfo II = + Legal->getInductionVars()->lookup(P); - switch (II.IK) { - case LoopVectorizationLegality::IK_NoInduction: - llvm_unreachable("Unknown induction"); - case LoopVectorizationLegality::IK_IntInduction: { - assert(P == OldInduction && "Unexpected PHI"); - Value *Broadcasted = getBroadcastInstrs(Induction); + switch (II.IK) { + case LoopVectorizationLegality::IK_NoInduction: + llvm_unreachable("Unknown induction"); + case LoopVectorizationLegality::IK_IntInduction: { + assert(P->getType() == II.StartValue->getType() && "Types must match"); + Type *PhiTy = P->getType(); + Value *Broadcasted; + if (P == OldInduction) { + // Handle the canonical induction variable. We might have had to + // extend the type. + Broadcasted = Builder.CreateTrunc(Induction, PhiTy); + } else { + // Handle other induction variables that are now based on the + // canonical one. + Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx, + "normalized.idx"); + NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy); + Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx, + "offset.idx"); + } + Broadcasted = getBroadcastInstrs(Broadcasted); + // After broadcasting the induction variable we need to make the vector + // consecutive by adding 0, 1, 2, etc. + for (unsigned part = 0; part < UF; ++part) + Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false); + return; + } + case LoopVectorizationLegality::IK_ReverseIntInduction: + case LoopVectorizationLegality::IK_PtrInduction: + case LoopVectorizationLegality::IK_ReversePtrInduction: + // Handle reverse integer and pointer inductions. + Value *StartIdx = ExtendedIdx; + // This is the normalized GEP that starts counting at zero. + Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, + "normalized.idx"); + + // Handle the reverse integer induction variable case. + if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) { + IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType()); + Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, + "resize.norm.idx"); + Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI, + "reverse.idx"); + + // This is a new value so do not hoist it out. + Value *Broadcasted = getBroadcastInstrs(ReverseInd); // After broadcasting the induction variable we need to make the - // vector consecutive by adding 0, 1, 2 ... + // vector consecutive by adding ... -3, -2, -1, 0. for (unsigned part = 0; part < UF; ++part) - Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false); - continue; + Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part, + true); + return; } - case LoopVectorizationLegality::IK_ReverseIntInduction: - case LoopVectorizationLegality::IK_PtrInduction: - case LoopVectorizationLegality::IK_ReversePtrInduction: - // Handle reverse integer and pointer inductions. - Value *StartIdx = 0; - // If we have a single integer induction variable then use it. - // Otherwise, start counting at zero. 
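
For VF = 4 and UF = 2, the broadcast-plus-step sequence above materializes part 0 as <i, i, i, i> + <0, 1, 2, 3> and part 1 as <i, i, i, i> + <4, 5, 6, 7>. A scalar model of one plausible per-lane computation behind getConsecutiveVector (an assumption for illustration, not the function's exact contract; StartIdx is VF * part, and the negate flag serves the reverse inductions handled below):

  void consecutiveVector(long Broadcast, int StartIdx, int VF, bool Negate,
                         long *Out) {
    for (int Lane = 0; Lane < VF; ++Lane)
      Out[Lane] = Broadcast + StartIdx + (Negate ? -Lane : Lane);
  }
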
- if (OldInduction) { - LoopVectorizationLegality::InductionInfo OldII = - Legal->getInductionVars()->lookup(OldInduction); - StartIdx = OldII.StartValue; - } else { - StartIdx = ConstantInt::get(Induction->getType(), 0); - } - // This is the normalized GEP that starts counting at zero. - Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, - "normalized.idx"); - // Handle the reverse integer induction variable case. - if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) { - IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType()); - Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, - "resize.norm.idx"); - Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI, - "reverse.idx"); - - // This is a new value so do not hoist it out. - Value *Broadcasted = getBroadcastInstrs(ReverseInd); - // After broadcasting the induction variable we need to make the - // vector consecutive by adding ... -3, -2, -1, 0. - for (unsigned part = 0; part < UF; ++part) - Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part, - true); + // Handle the pointer induction variable case. + assert(P->getType()->isPointerTy() && "Unexpected type."); + + // Is this a reverse induction ptr or a consecutive induction ptr. + bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction == + II.IK); + + // This is the vector of results. Notice that we don't generate + // vector geps because scalar geps result in better code. + for (unsigned part = 0; part < UF; ++part) { + if (VF == 1) { + int EltIndex = (part) * (Reverse ? -1 : 1); + Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); + Value *GlobalIdx; + if (Reverse) + GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); + else + GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); + + Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, + "next.gep"); + Entry[part] = SclrGep; continue; } - // Handle the pointer induction variable case. - assert(P->getType()->isPointerTy() && "Unexpected type."); - - // Is this a reverse induction ptr or a consecutive induction ptr. - bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction == - II.IK); - - // This is the vector of results. Notice that we don't generate - // vector geps because scalar geps result in better code. - for (unsigned part = 0; part < UF; ++part) { - Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); - for (unsigned int i = 0; i < VF; ++i) { - int EltIndex = (i + part * VF) * (Reverse ? -1 : 1); - Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); - Value *GlobalIdx; - if (!Reverse) - GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); - else - GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); - - Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, - "next.gep"); - VecVal = Builder.CreateInsertElement(VecVal, SclrGep, - Builder.getInt32(i), - "insert.gep"); - } - Entry[part] = VecVal; + Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); + for (unsigned int i = 0; i < VF; ++i) { + int EltIndex = (i + part * VF) * (Reverse ? 
-1 : 1); + Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); + Value *GlobalIdx; + if (!Reverse) + GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); + else + GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); + + Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, + "next.gep"); + VecVal = Builder.CreateInsertElement(VecVal, SclrGep, + Builder.getInt32(i), + "insert.gep"); } - continue; + Entry[part] = VecVal; } + return; + } +} +void +InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, + BasicBlock *BB, PhiVector *PV) { + // For each instruction in the old loop. + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + VectorParts &Entry = WidenMap.get(it); + switch (it->getOpcode()) { + case Instruction::Br: + // Nothing to do for PHIs and BR, since we already took care of the + // loop control flow instructions. + continue; + case Instruction::PHI:{ + // Vectorize PHINodes. + widenPHIInstruction(it, Entry, Legal, UF, VF, PV); + continue; }// End of PHI. case Instruction::Add: @@ -2131,6 +2610,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, case Instruction::Xor: { // Just widen binops. BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it); + setDebugLocFromInst(Builder, BinOp); VectorParts &A = getVectorValue(it->getOperand(0)); VectorParts &B = getVectorValue(it->getOperand(1)); @@ -2157,6 +2637,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // instruction with a scalar condition. Otherwise, use vector-select. bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)), OrigLoop); + setDebugLocFromInst(Builder, it); // The condition can be loop invariant but still defined inside the // loop. This means that we can't just use the original 'cond' value. @@ -2165,8 +2646,10 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, VectorParts &Cond = getVectorValue(it->getOperand(0)); VectorParts &Op0 = getVectorValue(it->getOperand(1)); VectorParts &Op1 = getVectorValue(it->getOperand(2)); - Value *ScalarCond = Builder.CreateExtractElement(Cond[0], - Builder.getInt32(0)); + + Value *ScalarCond = (VF == 1) ? Cond[0] : + Builder.CreateExtractElement(Cond[0], Builder.getInt32(0)); + for (unsigned Part = 0; Part < UF; ++Part) { Entry[Part] = Builder.CreateSelect( InvariantCond ? ScalarCond : Cond[Part], @@ -2181,6 +2664,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // Widen compares. Generate vector compares. bool FCmp = (it->getOpcode() == Instruction::FCmp); CmpInst *Cmp = dyn_cast<CmpInst>(it); + setDebugLocFromInst(Builder, it); VectorParts &A = getVectorValue(it->getOperand(0)); VectorParts &B = getVectorValue(it->getOperand(1)); for (unsigned Part = 0; Part < UF; ++Part) { @@ -2211,6 +2695,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, case Instruction::FPTrunc: case Instruction::BitCast: { CastInst *CI = dyn_cast<CastInst>(it); + setDebugLocFromInst(Builder, it); /// Optimize the special case where the source is the induction /// variable. Notice that we can only optimize the 'trunc' case /// because: a. FP conversions lose precision, b. sext/zext may wrap, @@ -2225,7 +2710,8 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, break; } /// Vectorize casts. - Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); + Type *DestTy = (VF == 1) ? 
CI->getType() : + VectorType::get(CI->getType(), VF); VectorParts &A = getVectorValue(it->getOperand(0)); for (unsigned Part = 0; Part < UF; ++Part) @@ -2237,20 +2723,32 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // Ignore dbg intrinsics. if (isa<DbgInfoIntrinsic>(it)) break; + setDebugLocFromInst(Builder, it); Module *M = BB->getParent()->getParent(); CallInst *CI = cast<CallInst>(it); Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); assert(ID && "Not an intrinsic call!"); - for (unsigned Part = 0; Part < UF; ++Part) { - SmallVector<Value*, 4> Args; - for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { - VectorParts &Arg = getVectorValue(CI->getArgOperand(i)); - Args.push_back(Arg[Part]); + switch (ID) { + case Intrinsic::lifetime_end: + case Intrinsic::lifetime_start: + scalarizeInstruction(it); + break; + default: + for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector<Value *, 4> Args; + for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { + VectorParts &Arg = getVectorValue(CI->getArgOperand(i)); + Args.push_back(Arg[Part]); + } + Type *Tys[] = {CI->getType()}; + if (VF > 1) + Tys[0] = VectorType::get(CI->getType()->getScalarType(), VF); + + Function *F = Intrinsic::getDeclaration(M, ID, Tys); + Entry[Part] = Builder.CreateCall(F, Args); } - Type *Tys[] = { VectorType::get(CI->getType()->getScalarType(), VF) }; - Function *F = Intrinsic::getDeclaration(M, ID, Tys); - Entry[Part] = Builder.CreateCall(F, Args); + break; } break; } @@ -2283,24 +2781,65 @@ void InnerLoopVectorizer::updateAnalysis() { DEBUG(DT->verifyAnalysis()); } +/// \brief Check whether it is safe to if-convert this phi node. +/// +/// Phi nodes with constant expressions that can trap are not safe to if +/// convert. +static bool canIfConvertPHINodes(BasicBlock *BB) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + PHINode *Phi = dyn_cast<PHINode>(I); + if (!Phi) + return true; + for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p) + if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p))) + if (C->canTrap()) + return false; + } + return true; +} + bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!EnableIfConversion) return false; assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); - std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector(); + + // A list of pointers that we can safely read and write to. + SmallPtrSet<Value *, 8> SafePointes; + + // Collect safe addresses. + for (Loop::block_iterator BI = TheLoop->block_begin(), + BE = TheLoop->block_end(); BI != BE; ++BI) { + BasicBlock *BB = *BI; + + if (blockNeedsPredication(BB)) + continue; + + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + SafePointes.insert(LI->getPointerOperand()); + else if (StoreInst *SI = dyn_cast<StoreInst>(I)) + SafePointes.insert(SI->getPointerOperand()); + } + } // Collect the blocks that need predication. - for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) { - BasicBlock *BB = LoopBlocks[i]; + BasicBlock *Header = TheLoop->getHeader(); + for (Loop::block_iterator BI = TheLoop->block_begin(), + BE = TheLoop->block_end(); BI != BE; ++BI) { + BasicBlock *BB = *BI; // We don't support switch statements inside loops. if (!isa<BranchInst>(BB->getTerminator())) return false; // We must be able to predicate all blocks that need to be predicated. 
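
canIfConvertPHINodes above exists because if-conversion makes every incoming value of a folded block's phis evaluate unconditionally, so an incoming constant expression that may trap, in the Constant::canTrap() sense (for example a constant sdiv whose divisor is not a known non-zero integer), must block the transform. A hedged usage sketch of that predicate:

  #include "llvm/IR/Constants.h"
  using namespace llvm;

  static bool incomingValueSafeToSpeculate(Value *V) {
    if (Constant *C = dyn_cast<Constant>(V))
      return !C->canTrap(); // e.g. sdiv (i32 1, i32 ptrtoint (i32* @g to i32))
    return true;            // non-constant incoming values are vetted
  }                         // separately by blockCanBePredicated
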
- if (blockNeedsPredication(BB) && !blockCanBePredicated(BB)) + if (blockNeedsPredication(BB)) { + if (!blockCanBePredicated(BB, SafePointes)) + return false; + } else if (BB != Header && !canIfConvertPHINodes(BB)) return false; + } // We can if-convert this loop. @@ -2325,27 +2864,26 @@ bool LoopVectorizationLegality::canVectorize() { if (!TheLoop->getExitingBlock()) return false; - unsigned NumBlocks = TheLoop->getNumBlocks(); + // We need to have a loop header. + DEBUG(dbgs() << "LV: Found a loop: " << + TheLoop->getHeader()->getName() << '\n'); // Check if we can if-convert non single-bb loops. + unsigned NumBlocks = TheLoop->getNumBlocks(); if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); return false; } - // We need to have a loop header. - BasicBlock *Latch = TheLoop->getLoopLatch(); - DEBUG(dbgs() << "LV: Found a loop: " << - TheLoop->getHeader()->getName() << "\n"); - // ScalarEvolution needs to be able to find the exit count. - const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch); + const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); if (ExitCount == SE->getCouldNotCompute()) { DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); return false; } // Do not loop-vectorize loops with a tiny trip count. + BasicBlock *Latch = TheLoop->getLoopLatch(); unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch); if (TC > 0u && TC < TinyTripCountVectorThreshold) { DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << @@ -2378,6 +2916,26 @@ bool LoopVectorizationLegality::canVectorize() { return true; } +static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) { + if (Ty->isPointerTy()) + return DL.getIntPtrType(Ty); + + // It is possible that char's or short's overflow when we ask for the loop's + // trip count, work around this by changing the type size. + if (Ty->getScalarSizeInBits() < 32) + return Type::getInt32Ty(Ty->getContext()); + + return Ty; +} + +static Type* getWiderType(DataLayout &DL, Type *Ty0, Type *Ty1) { + Ty0 = convertPointerToIntegerType(DL, Ty0); + Ty1 = convertPointerToIntegerType(DL, Ty1); + if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits()) + return Ty0; + return Ty1; +} + /// \brief Check that the instruction has outside loop users and is not an /// identified reduction variable. static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, @@ -2391,7 +2949,7 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, Instruction *U = cast<Instruction>(*I); // This user may be a reduction exit value. if (!TheLoop->contains(U)) { - DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); + DEBUG(dbgs() << "LV: Found an outside user for : " << *U << '\n'); return true; } } @@ -2402,13 +2960,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { BasicBlock *PreHeader = TheLoop->getLoopPreheader(); BasicBlock *Header = TheLoop->getHeader(); - // If we marked the scalar loop as "already vectorized" then no need - // to vectorize it again. - if (Header->getTerminator()->getMetadata(AlreadyVectorizedMDName)) { - DEBUG(dbgs() << "LV: This loop was vectorized before\n"); - return false; - } - // Look for the attribute signaling the absence of NaNs. 
Function &F = *Header->getParent();
 if (F.hasFnAttribute("no-nans-fp-math"))
@@ -2425,10 +2976,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 ++it) {
 if (PHINode *Phi = dyn_cast<PHINode>(it)) {
+ Type *PhiTy = Phi->getType();
 // Check that this PHI type is allowed.
- if (!Phi->getType()->isIntegerTy() &&
- !Phi->getType()->isFloatingPointTy() &&
- !Phi->getType()->isPointerTy()) {
+ if (!PhiTy->isIntegerTy() &&
+ !PhiTy->isFloatingPointTy() &&
+ !PhiTy->isPointerTy()) {
 DEBUG(dbgs() << "LV: Found a non-int non-pointer PHI.\n");
 return false;
 }
@@ -2456,17 +3008,29 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 InductionKind IK = isInductionVariable(Phi);
 if (IK_NoInduction != IK) {
+ // Get the widest type.
+ if (!WidestIndTy)
+ WidestIndTy = convertPointerToIntegerType(*DL, PhiTy);
+ else
+ WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy);
+
 // Int inductions are special because we only allow one IV.
 if (IK == IK_IntInduction) {
- if (Induction) {
- DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
- return false;
- }
- Induction = Phi;
+ // Use the phi node with the widest type as induction. Use the last
+ // one if there are multiple (no good reason for doing this other
+ // than it is expedient).
+ if (!Induction || PhiTy == WidestIndTy)
+ Induction = Phi;
 }
 DEBUG(dbgs() << "LV: Found an induction variable.\n");
 Inductions[Phi] = InductionInfo(StartValue, IK);
+
+ // Until we explicitly handle the case of an induction variable with
+ // an outside loop user we have to give up vectorizing this loop.
+ if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
+ return false;
+
 continue;
 }
@@ -2503,7 +3067,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 continue;
 }
 if (AddReductionVar(Phi, RK_FloatMinMax)) {
- DEBUG(dbgs() << "LV: Found an float MINMAX reduction PHI."<< *Phi <<"\n");
+ DEBUG(dbgs() << "LV: Found a float MINMAX reduction PHI."<< *Phi <<
+ "\n");
 continue;
 }
@@ -2520,9 +3085,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 }
 // Check that the instruction return type is vectorizable.
- if (!VectorType::isValidElementType(it->getType()) &&
- !it->getType()->isVoidTy()) {
- DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
+ // Also, we can't vectorize extractelement instructions.
+ if ((!VectorType::isValidElementType(it->getType()) &&
+ !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
+ DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
 return false;
 }
@@ -2544,7 +3110,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 if (!Induction) {
 DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
- assert(getInductionVars()->size() && "No induction variables");
+ if (Inductions.empty())
+ return false;
 }
 return true;
@@ -2573,59 +3140,715 @@ void LoopVectorizationLegality::collectLoopUniforms() {
 Uniforms.insert(I);
 // Insert all operands.
- for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) {
- Worklist.push_back(I->getOperand(i));
- }
+ Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
 }
 }
-AliasAnalysis::Location
-LoopVectorizationLegality::getLoadStoreLocation(Instruction *Inst) {
- if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
- return AA->getLocation(Store);
- else if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
- return AA->getLocation(Load);
+namespace {
+/// \brief Analyses memory accesses in a loop.
+///
+/// Checks whether run time pointer checks are needed and builds sets for data
+/// dependence checking.
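
The AccessAnalysis class that follows partitions pointer accesses with llvm::EquivalenceClasses: accesses that may share an underlying object end up in one set, and only members of the same set ever need a pairwise dependence test. A minimal sketch of that union-find usage, with plain int keys instead of the class's MemAccessInfo:

  #include "llvm/ADT/EquivalenceClasses.h"

  static void groupAccesses(llvm::EquivalenceClasses<int> &Sets, int A, int B) {
    Sets.insert(A);
    Sets.insert(B);
    Sets.unionSets(A, B);  // now getLeaderValue(A) == getLeaderValue(B)
  }
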
+class AccessAnalysis {
+public:
+ /// \brief Read or write access location.
+ typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
+ typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
+
+ /// \brief Set of potential dependent memory accesses.
+ typedef EquivalenceClasses<MemAccessInfo> DepCandidates;
+
+ AccessAnalysis(DataLayout *Dl, DepCandidates &DA) :
+ DL(Dl), DepCands(DA), AreAllWritesIdentified(true),
+ AreAllReadsIdentified(true), IsRTCheckNeeded(false) {}
+
+ /// \brief Register a load and whether it is only read from.
+ void addLoad(Value *Ptr, bool IsReadOnly) {
+ Accesses.insert(MemAccessInfo(Ptr, false));
+ if (IsReadOnly)
+ ReadOnlyPtr.insert(Ptr);
+ }
 
- llvm_unreachable("Should be either load or store instruction");
+ /// \brief Register a store.
+ void addStore(Value *Ptr) {
+ Accesses.insert(MemAccessInfo(Ptr, true));
+ }
+
+ /// \brief Check whether we can check the pointers at runtime for
+ /// non-intersection.
+ bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
+ unsigned &NumComparisons, ScalarEvolution *SE,
+ Loop *TheLoop, bool ShouldCheckStride = false);
+
+ /// \brief Goes over all memory accesses, checks whether a RT check is needed
+ /// and builds sets of dependent accesses.
+ void buildDependenceSets() {
+ // Process read-write pointers first.
+ processMemAccesses(false);
+ // Next, process read pointers.
+ processMemAccesses(true);
+ }
+
+ bool isRTCheckNeeded() { return IsRTCheckNeeded; }
+
+ bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
+ void resetDepChecks() { CheckDeps.clear(); }
+
+ MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; }
+
+private:
+ typedef SetVector<MemAccessInfo> PtrAccessSet;
+ typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap;
+
+ /// \brief Go over all memory accesses or only the deferred ones if
+ /// \p UseDeferred is true and check whether runtime pointer checks are needed
+ /// and build sets of dependency check candidates.
+ void processMemAccesses(bool UseDeferred);
+
+ /// Set of all accesses.
+ PtrAccessSet Accesses;
+
+ /// Set of accesses to check after all writes have been processed.
+ PtrAccessSet DeferredAccesses;
+
+ /// Map of pointers to last access encountered.
+ UnderlyingObjToAccessMap ObjToLastAccess;
+
+ /// Set of accesses that need a further dependence check.
+ MemAccessInfoSet CheckDeps;
+
+ /// Set of pointers that are read only.
+ SmallPtrSet<Value*, 16> ReadOnlyPtr;
+
+ /// Set of underlying objects already written to.
+ SmallPtrSet<Value*, 16> WriteObjects;
+
+ DataLayout *DL;
+
+ /// Sets of potentially dependent accesses - members of one set share an
+ /// underlying pointer. The set "CheckDeps" identifies which sets really need
+ /// a dependence check.
+ DepCandidates &DepCands;
+
+ bool AreAllWritesIdentified;
+ bool AreAllReadsIdentified;
+ bool IsRTCheckNeeded;
+};
+
+} // end anonymous namespace
+
+/// \brief Check whether a pointer can participate in a runtime bounds check.
+static bool hasComputableBounds(ScalarEvolution *SE, Value *Ptr) {
+ const SCEV *PtrScev = SE->getSCEV(Ptr);
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
+ if (!AR)
+ return false;
+
+ return AR->isAffine();
 }
 
-bool
-LoopVectorizationLegality::hasPossibleGlobalWriteReorder(
- Value *Object,
- Instruction *Inst,
- AliasMultiMap& WriteObjects,
- unsigned MaxByteWidth) {
+/// \brief Check the stride of the pointer and ensure that it does not wrap in
+/// the address space.
+static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
+ const Loop *Lp);
+
+bool AccessAnalysis::canCheckPtrAtRT(
+ LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
+ unsigned &NumComparisons, ScalarEvolution *SE,
+ Loop *TheLoop, bool ShouldCheckStride) {
+ // Find pointers with computable bounds. We are going to use this information
+ // to place a runtime bound check.
+ unsigned NumReadPtrChecks = 0;
+ unsigned NumWritePtrChecks = 0;
+ bool CanDoRT = true;
+
+ bool IsDepCheckNeeded = isDependencyCheckNeeded();
+ // We assign consecutive ids to accesses from different dependence sets.
+ // Accesses within the same set don't need a runtime check.
+ unsigned RunningDepId = 1;
+ DenseMap<Value *, unsigned> DepSetId;
+
+ for (PtrAccessSet::iterator AI = Accesses.begin(), AE = Accesses.end();
+ AI != AE; ++AI) {
+ const MemAccessInfo &Access = *AI;
+ Value *Ptr = Access.getPointer();
+ bool IsWrite = Access.getInt();
+
+ // Just add write checks if we have both.
+ if (!IsWrite && Accesses.count(MemAccessInfo(Ptr, true)))
+ continue;
+
+ if (IsWrite)
+ ++NumWritePtrChecks;
+ else
+ ++NumReadPtrChecks;
+
+ if (hasComputableBounds(SE, Ptr) &&
+ // When we run after a failing dependency check we have to make sure we
+ // don't have wrapping pointers.
+ (!ShouldCheckStride || isStridedPtr(SE, DL, Ptr, TheLoop) == 1)) {
+ // The id of the dependence set.
+ unsigned DepId;
+
+ if (IsDepCheckNeeded) {
+ Value *Leader = DepCands.getLeaderValue(Access).getPointer();
+ unsigned &LeaderId = DepSetId[Leader];
+ if (!LeaderId)
+ LeaderId = RunningDepId++;
+ DepId = LeaderId;
+ } else
+ // Each access has its own dependence set.
+ DepId = RunningDepId++;
+
+ RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId);
+
+ DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n');
+ } else {
+ CanDoRT = false;
+ }
+ }
 
- AliasAnalysis::Location ThisLoc = getLoadStoreLocation(Inst);
 
+ if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2)
+ NumComparisons = 0; // Only one dependence set.
+ else {
+ NumComparisons = (NumWritePtrChecks * (NumReadPtrChecks +
+ NumWritePtrChecks - 1));
+ }
 
- std::vector<Instruction*>::iterator
- it = WriteObjects[Object].begin(),
- end = WriteObjects[Object].end();
 
+ // If the pointers that we would use for the bounds comparison have different
+ // address spaces, assume the values aren't directly comparable, so we can't
+ // use them for the runtime check. We also have to assume they could
+ // overlap. In the future there should be metadata for whether address spaces
+ // are disjoint.
+ unsigned NumPointers = RtCheck.Pointers.size();
+ for (unsigned i = 0; i < NumPointers; ++i) {
+ for (unsigned j = i + 1; j < NumPointers; ++j) {
+ // Only need to check pointers between two different dependency sets.
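
The comparison count just computed pairs each write against every other pointer and never pairs reads with reads; with 2 writes and 3 reads that is 2 * (3 + 2 - 1) = 8 bound checks. The formula is a conservative upper bound, since write/write pairs are counted once from each side. As a tiny helper for the arithmetic:

  static unsigned memcheckComparisons(unsigned Writes, unsigned Reads) {
    // Writes * (Reads + Writes - 1): write-vs-read once per pair,
    // write-vs-write counted from both ends, read-vs-read never.
    return Writes ? Writes * (Reads + Writes - 1) : 0;
  }
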
+ if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j])
+ continue;
+
+ Value *PtrI = RtCheck.Pointers[i];
+ Value *PtrJ = RtCheck.Pointers[j];
+
+ unsigned ASi = PtrI->getType()->getPointerAddressSpace();
+ unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
+ if (ASi != ASj) {
+ DEBUG(dbgs() << "LV: Runtime check would require comparison between"
+ " different address spaces\n");
+ return false;
+ }
+ }
+ }
+
+ return CanDoRT;
+}
+
+static bool isFunctionScopeIdentifiedObject(Value *Ptr) {
+ return isNoAliasArgument(Ptr) || isNoAliasCall(Ptr) || isa<AllocaInst>(Ptr);
+}
 
- for (; it != end; ++it) {
- Instruction* I = *it;
- if (I == Inst)
+void AccessAnalysis::processMemAccesses(bool UseDeferred) {
+ // We process the set twice: first we process read-write pointers, last we
+ // process read-only pointers. This allows us to skip dependence tests for
+ // read-only pointers.
+
+ PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;
+ for (PtrAccessSet::iterator AI = S.begin(), AE = S.end(); AI != AE; ++AI) {
+ const MemAccessInfo &Access = *AI;
+ Value *Ptr = Access.getPointer();
+ bool IsWrite = Access.getInt();
+
+ DepCands.insert(Access);
+
+ // Memorize read-only pointers for later processing and skip them in the
+ // first round (they need to be checked after we have seen all write
+ // pointers). Note: we also mark pointers that are not consecutive as
+ // "read-only" pointers (so that we check "a[b[i]] +="). Hence, we need the
+ // second check for "!IsWrite".
+ bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
+ if (!UseDeferred && IsReadOnlyPtr) {
+ DeferredAccesses.insert(Access);
 continue;
+ }
+
+ bool NeedDepCheck = false;
+ // Check whether there is the possibility of dependency because of underlying
+ // objects being the same.
+ typedef SmallVector<Value*, 16> ValueVector;
+ ValueVector TempObjects;
+ GetUnderlyingObjects(Ptr, TempObjects, DL);
+ for (ValueVector::iterator UI = TempObjects.begin(), UE = TempObjects.end();
+ UI != UE; ++UI) {
+ Value *UnderlyingObj = *UI;
+
+ // If this is a write then it needs to be an identified object. If this is a
+ // read and all writes (so far) are identified function scope objects we
+ // don't need an identified underlying object but only an Argument (the
+ // next write is going to invalidate this assumption if it is
+ // unidentified).
+ // This is a micro-optimization for the case where all writes are
+ // identified and we have one argument pointer.
+ // Otherwise, we do need a runtime check.
+ if ((IsWrite && !isFunctionScopeIdentifiedObject(UnderlyingObj)) ||
+ (!IsWrite && (!AreAllWritesIdentified ||
+ !isa<Argument>(UnderlyingObj)) &&
+ !isIdentifiedObject(UnderlyingObj))) {
+ DEBUG(dbgs() << "LV: Found an unidentified " <<
+ (IsWrite ? "write" : "read" ) << " ptr: " << *UnderlyingObj <<
+ "\n");
+ IsRTCheckNeeded = (IsRTCheckNeeded ||
+ !isIdentifiedObject(UnderlyingObj) ||
+ !AreAllReadsIdentified);
+
+ if (IsWrite)
+ AreAllWritesIdentified = false;
+ if (!IsWrite)
+ AreAllReadsIdentified = false;
+ }
+
+ // If this is a write - check other reads and writes for conflicts. If
+ // this is a read, only check other writes for conflicts (but only if there
+ // is no other write to the ptr - this is an optimization to catch "a[i] =
+ // a[i] + " without having to do a dependence check).
+ if ((IsWrite || IsReadOnlyPtr) && WriteObjects.count(UnderlyingObj))
+ NeedDepCheck = true;
+
+ if (IsWrite)
+ WriteObjects.insert(UnderlyingObj);
+
+ // Create sets of pointers connected by shared underlying objects.
+ UnderlyingObjToAccessMap::iterator Prev =
+ ObjToLastAccess.find(UnderlyingObj);
+ if (Prev != ObjToLastAccess.end())
+ DepCands.unionSets(Access, Prev->second);
+
+ ObjToLastAccess[UnderlyingObj] = Access;
+ }
+
+ if (NeedDepCheck)
+ CheckDeps.insert(Access);
+ }
+}
+
+namespace {
+/// \brief Checks memory dependences among accesses to the same underlying
+/// object to determine whether vectorization is legal or not (and at
+/// which vectorization factor).
+///
+/// This class works under the assumption that we already checked that memory
+/// locations with different underlying pointers are "must-not alias".
+/// We use the ScalarEvolution framework to symbolically evaluate access
+/// function pairs. Since we currently don't restructure the loop we can rely
+/// on the program order of memory accesses to determine their safety.
+/// At the moment we will only deem accesses as safe for:
+///  * A negative constant distance assuming program order.
+///
+///      Safe: tmp = a[i + 1];            OR     a[i + 1] = x;
+///            a[i] = tmp;                       y = a[i];
+///
+///   The latter case is safe because later checks guarantee that there can't
+///   be a cycle through a phi node (that is, we check that "x" and "y" are not
+///   the same variable: a header phi can only be an induction or a reduction, a
+///   reduction can't have a memory sink, an induction can't have a memory
+///   source). This is important and must not be violated (or we have to
+///   resort to checking for cycles through memory).
+///
+///  * A positive constant distance assuming program order that is bigger
+///    than the biggest memory access.
+///
+///     tmp = a[i]        OR              b[i] = x
+///     a[i+2] = tmp                      y = b[i+2];
+///
+///     Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively.
+///
+///  * Zero distances and all accesses have the same size.
+///
+class MemoryDepChecker {
+public:
+ typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
+ typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
+
+ MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L)
+ : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0),
+ ShouldRetryWithRuntimeCheck(false) {}
+
+ /// \brief Register the location (instructions are given increasing numbers)
+ /// of a write access.
+ void addAccess(StoreInst *SI) {
+ Value *Ptr = SI->getPointerOperand();
+ Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
+ InstMap.push_back(SI);
+ ++AccessIdx;
+ }
+
+ /// \brief Register the location (instructions are given increasing numbers)
+ /// of a read access.
+ void addAccess(LoadInst *LI) {
+ Value *Ptr = LI->getPointerOperand();
+ Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
+ InstMap.push_back(LI);
+ ++AccessIdx;
+ }
+
+ /// \brief Check whether the dependencies between the accesses are safe.
+ ///
+ /// Only checks sets with elements in \p CheckDeps.
+ bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
+ MemAccessInfoSet &CheckDeps);
+
+ /// \brief The maximum number of bytes of a vector register we can vectorize
+ /// the accesses safely with.
+ unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
+
+ /// \brief In some cases when the dependency check fails we can still
+ /// vectorize the loop with a dynamic array access check.
+ bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; }
+
+private:
+ ScalarEvolution *SE;
+ DataLayout *DL;
+ const Loop *InnermostLoop;
+
+ /// \brief Maps access locations (ptr, read/write) to program order.
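
Concrete loop shapes for the two safe cases documented in the comment block above (C-style, illustrative only):

  void negative_distance(int *a, int n) {  // dist(a[i+1] -> a[i]) = -1:
    for (int i = 0; i + 1 < n; ++i)        // the load of a[i+1] precedes the
      a[i] = a[i + 1];                     // conflicting store in program order
  }

  void positive_distance(int *a, int n) {  // dist = +2 elements: safe up to
    for (int i = 0; i + 2 < n; ++i)        // 2 * sizeof(int) bytes per vector,
      a[i + 2] = a[i];                     // recorded in MaxSafeDepDistBytes
  }
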
+ DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses;
+
+ /// \brief Memory access instructions in program order.
+ SmallVector<Instruction *, 16> InstMap;
+
+ /// \brief The program order index to be used for the next instruction.
+ unsigned AccessIdx;
+
+ // We can access this many bytes in parallel safely.
+ unsigned MaxSafeDepDistBytes;
+
+ /// \brief If we see a non-constant dependence distance we can still try to
+ /// vectorize this loop with runtime checks.
+ bool ShouldRetryWithRuntimeCheck;
+
+ /// \brief Check whether there is a plausible dependence between the two
+ /// accesses.
+ ///
+ /// Access \p A must happen before \p B in program order. The two indices
+ /// identify the index into the program order map.
+ ///
+ /// This function checks whether there is a plausible dependence (or the
+ /// absence of such can't be proved) between the two accesses. If there is a
+ /// plausible dependence but the dependence distance is bigger than one
+ /// element access it records this distance in \p MaxSafeDepDistBytes (if this
+ /// distance is smaller than any other distance encountered so far).
+ /// Otherwise, this function returns true signaling a possible dependence.
+ bool isDependent(const MemAccessInfo &A, unsigned AIdx,
+ const MemAccessInfo &B, unsigned BIdx);
+
+ /// \brief Check whether the data dependence could prevent store-load
+ /// forwarding.
+ bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
+};
+
+} // end anonymous namespace
+
+static bool isInBoundsGep(Value *Ptr) {
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
+ return GEP->isInBounds();
+ return false;
+}
-
+/// \brief Check whether the access through \p Ptr has a constant stride.
+static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
+ const Loop *Lp) {
+ const Type *Ty = Ptr->getType();
+ assert(Ty->isPointerTy() && "Unexpected non ptr");
+
+ // Make sure that the pointer does not point to aggregate types.
+ const PointerType *PtrTy = cast<PointerType>(Ty);
+ if (PtrTy->getElementType()->isAggregateType()) {
+ DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr <<
+ "\n");
+ return 0;
+ }
+
+ const SCEV *PtrScev = SE->getSCEV(Ptr);
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
+ if (!AR) {
+ DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer "
+ << *Ptr << " SCEV: " << *PtrScev << "\n");
+ return 0;
+ }
+
+ // The access function must stride over the innermost loop.
+ if (Lp != AR->getLoop()) {
+ DEBUG(dbgs() << "LV: Bad stride - Not striding over innermost loop " <<
+ *Ptr << " SCEV: " << *PtrScev << "\n");
+ return 0;
+ }
+
+ // The address calculation must not wrap. Otherwise, a dependence could be
+ // inverted.
+ // An inbounds getelementptr that is an AddRec with a unit stride
+ // cannot wrap per definition. The unit stride requirement is checked later.
+ // A getelementptr without an inbounds attribute and unit stride would have
+ // to access the pointer value "0" which is undefined behavior in address
+ // space 0, therefore we can also vectorize this case.
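
The stride itself is derived in the next hunk as StepVal / sizeof(element): an add-rec such as {%A,+,8} over i32 elements gives stride 2 (strided but not consecutive), while a step that is not a whole number of elements gives up with 0. A scalar model of that division:

  static int strideFromStep(long long StepBytes, long long EltBytes) {
    if (StepBytes % EltBytes)            // step is not a multiple of the
      return 0;                          // element size: treat as non-strided
    return (int)(StepBytes / EltBytes);  // 8 / 4 == 2 for the example above
  }
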
+ bool IsInBoundsGEP = isInBoundsGep(Ptr); + bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask); + bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0; + if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) { + DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the address space " + << *Ptr << " SCEV: " << *PtrScev << "\n"); + return 0; + } + + // Check the step is constant. + const SCEV *Step = AR->getStepRecurrence(*SE); + + // Calculate the pointer stride and check if it is consecutive. + const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); + if (!C) { + DEBUG(dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr << + " SCEV: " << *PtrScev << "\n"); + return 0; + } + + int64_t Size = DL->getTypeAllocSize(PtrTy->getElementType()); + const APInt &APStepVal = C->getValue()->getValue(); + + // Huge step value - give up. + if (APStepVal.getBitWidth() > 64) + return 0; + + int64_t StepVal = APStepVal.getSExtValue(); + + // Strided access. + int64_t Stride = StepVal / Size; + int64_t Rem = StepVal % Size; + if (Rem) + return 0; + + // If the SCEV could wrap but we have an inbounds gep with a unit stride we + // know we can't "wrap around the address space". In case of address space + // zero we know that this won't happen without triggering undefined behavior. + if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) && + Stride != 1 && Stride != -1) + return 0; + + return Stride; +} + +bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance, + unsigned TypeByteSize) { + // If loads occur at a distance that is not a multiple of a feasible vector + // factor store-load forwarding does not take place. + // Positive dependences might cause troubles because vectorizing them might + // prevent store-load forwarding making vectorized code run a lot slower. + // a[i] = a[i-3] ^ a[i-8]; + // The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and + // hence on your typical architecture store-load forwarding does not take + // place. Vectorizing in such cases does not make sense. + // Store-load forwarding distance. + const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize; + // Maximum vector factor. + unsigned MaxVFWithoutSLForwardIssues = MaxVectorWidth*TypeByteSize; + if(MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues) + MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes; + + for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues; + vf *= 2) { + if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) { + MaxVFWithoutSLForwardIssues = (vf >>=1); + break; + } + } + + if (MaxVFWithoutSLForwardIssues< 2*TypeByteSize) { + DEBUG(dbgs() << "LV: Distance " << Distance << + " that could cause a store-load forwarding conflict\n"); + return true; + } + + if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes && + MaxVFWithoutSLForwardIssues != MaxVectorWidth*TypeByteSize) + MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues; + return false; +} + +bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, + const MemAccessInfo &B, unsigned BIdx) { + assert (AIdx < BIdx && "Must pass arguments in program order"); + + Value *APtr = A.getPointer(); + Value *BPtr = B.getPointer(); + bool AIsWrite = A.getInt(); + bool BIsWrite = B.getInt(); + + // Two reads are independent. 
+ if (!AIsWrite && !BIsWrite)
+ return false;
+
+ const SCEV *AScev = SE->getSCEV(APtr);
+ const SCEV *BScev = SE->getSCEV(BPtr);
+
+ int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop);
+ int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop);
+
+ const SCEV *Src = AScev;
+ const SCEV *Sink = BScev;
+
+ // If the induction step is negative we have to invert source and sink of the
+ // dependence.
+ if (StrideAPtr < 0) {
+ //Src = BScev;
+ //Sink = AScev;
+ std::swap(APtr, BPtr);
+ std::swap(Src, Sink);
+ std::swap(AIsWrite, BIsWrite);
+ std::swap(AIdx, BIdx);
+ std::swap(StrideAPtr, StrideBPtr);
+ }
+
+ const SCEV *Dist = SE->getMinusSCEV(Sink, Src);
+
+ DEBUG(dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sink
+ << "(Induction step: " << StrideAPtr << ")\n");
+ DEBUG(dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to "
+ << *InstMap[BIdx] << ": " << *Dist << "\n");
+
+ // Need consecutive accesses. We don't want to vectorize
+ // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in
+ // the address space.
+ if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr){
+ DEBUG(dbgs() << "Non-consecutive pointer access\n");
+ return true;
+ }
+
+ const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
+ if (!C) {
+ DEBUG(dbgs() << "LV: Dependence because of non constant distance\n");
+ ShouldRetryWithRuntimeCheck = true;
+ return true;
+ }
+
+ Type *ATy = APtr->getType()->getPointerElementType();
+ Type *BTy = BPtr->getType()->getPointerElementType();
+ unsigned TypeByteSize = DL->getTypeAllocSize(ATy);
+
+ // Negative distances are not plausible dependencies.
+ const APInt &Val = C->getValue()->getValue();
+ if (Val.isNegative()) {
+ bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
+ if (IsTrueDataDependence &&
+ (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) ||
+ ATy != BTy))
 return true;
+
+ DEBUG(dbgs() << "LV: Dependence is negative: NoDep\n");
+ return false;
+ }
+
+ // Write to the same location with the same size.
+ // Could be improved to assert type sizes are the same (i32 == float, etc).
+ if (Val == 0) {
+ if (ATy == BTy)
+ return false;
+ DEBUG(dbgs() << "LV: Zero dependence difference but different types\n");
+ return true;
+ }
+
+ assert(Val.isStrictlyPositive() && "Expect a positive value");
+
+ // Positive distance bigger than max vectorization factor.
+ if (ATy != BTy) {
+ DEBUG(dbgs() <<
+ "LV: ReadWrite-Write positive dependency with different types\n");
+ return false;
 }
+
+ unsigned Distance = (unsigned) Val.getZExtValue();
+
+ // Bail out early if passed-in parameters make vectorization not feasible.
+ unsigned ForcedFactor = VectorizationFactor ? VectorizationFactor : 1;
+ unsigned ForcedUnroll = VectorizationUnroll ? VectorizationUnroll : 1;
+
+ // The distance must be bigger than the size needed for a vectorized version
+ // of the operation and the size of the vectorized operation must not be
+ // bigger than the current maximum size.
+ if (Distance < 2*TypeByteSize ||
+ 2*TypeByteSize > MaxSafeDepDistBytes ||
+ Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
+ DEBUG(dbgs() << "LV: Failure because of Positive distance "
+ << Val.getSExtValue() << '\n');
+ return true;
+ }
+
+ MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ?
+ Distance : MaxSafeDepDistBytes; + + bool IsTrueDataDependence = (!AIsWrite && BIsWrite); + if (IsTrueDataDependence && + couldPreventStoreLoadForward(Distance, TypeByteSize)) + return true; + + DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() << + " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n'); + return false; } +bool +MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, + MemAccessInfoSet &CheckDeps) { + + MaxSafeDepDistBytes = -1U; + while (!CheckDeps.empty()) { + MemAccessInfo CurAccess = *CheckDeps.begin(); + + // Get the relevant memory access set. + EquivalenceClasses<MemAccessInfo>::iterator I = + AccessSets.findValue(AccessSets.getLeaderValue(CurAccess)); + + // Check accesses within this set. + EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE; + AI = AccessSets.member_begin(I), AE = AccessSets.member_end(); + + // Check every access pair. + while (AI != AE) { + CheckDeps.erase(*AI); + EquivalenceClasses<MemAccessInfo>::member_iterator OI = llvm::next(AI); + while (OI != AE) { + // Check every accessing instruction pair in program order. + for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(), + I1E = Accesses[*AI].end(); I1 != I1E; ++I1) + for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(), + I2E = Accesses[*OI].end(); I2 != I2E; ++I2) { + if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2)) + return false; + if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1)) + return false; + } + ++OI; + } + AI++; + } + } + return true; +} + bool LoopVectorizationLegality::canVectorizeMemory() { typedef SmallVector<Value*, 16> ValueVector; typedef SmallPtrSet<Value*, 16> ValueSet; + // Holds the Load and Store *instructions*. ValueVector Loads; ValueVector Stores; + + // Holds all the different accesses in the loop. + unsigned NumReads = 0; + unsigned NumReadWrites = 0; + PtrRtCheck.Pointers.clear(); PtrRtCheck.Need = false; const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); + MemoryDepChecker DepChecker(SE, DL, TheLoop); // For each block. for (Loop::block_iterator bb = TheLoop->block_begin(), @@ -2639,6 +3862,13 @@ bool LoopVectorizationLegality::canVectorizeMemory() { // but is not a load, then we quit. Notice that we don't handle function // calls that read or write. if (it->mayReadFromMemory()) { + // Many math library functions read the rounding mode. We will only + // vectorize a loop if it contains known function calls that don't set + // the flag. Therefore, it is safe to ignore this read from memory. + CallInst *Call = dyn_cast<CallInst>(it); + if (Call && getIntrinsicIDForCall(Call, TLI)) + continue; + LoadInst *Ld = dyn_cast<LoadInst>(it); if (!Ld) return false; if (!Ld->isSimple() && !IsAnnotatedParallel) { @@ -2646,6 +3876,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return false; } Loads.push_back(Ld); + DepChecker.addAccess(Ld); continue; } @@ -2658,9 +3889,10 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return false; } Stores.push_back(St); + DepChecker.addAccess(St); } - } // next instr. - } // next block. + } // Next instr. + } // Next block. // Now we have two lists that hold the loads and the stores. // Next, we find the pointers that they use. @@ -2672,10 +3904,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return true; } - // Holds the read and read-write *pointers* that we find. These maps hold - // unique values for pointers (so no need for multi-map). 
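
areDepsSafe above visits every access pair of an equivalence class and queries isDependent with the program-order-earlier access first. A sketch of that enumeration over one class, using plain arrays in place of the EquivalenceClasses members (illustrative, not the pass's API):

  static bool classIsSafe(int N, const unsigned *Order,
                          bool (*dependent)(int, int)) {
    for (int A = 0; A < N; ++A)
      for (int B = A + 1; B < N; ++B)
        if (Order[A] < Order[B] ? dependent(A, B) : dependent(B, A))
          return false;  // one plausible dependence poisons the class
    return true;
  }
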
- AliasMap Reads; - AliasMap ReadWrites; + AccessAnalysis::DepCandidates DependentAccesses; + AccessAnalysis Accesses(DL, DependentAccesses); // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects // multiple times on the same object. If the ptr is accessed twice, once @@ -2694,10 +3924,12 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return false; } - // If we did *not* see this pointer before, insert it to - // the read-write list. At this phase it is only a 'write' list. - if (Seen.insert(Ptr)) - ReadWrites.insert(std::make_pair(Ptr, ST)); + // If we did *not* see this pointer before, insert it to the read-write + // list. At this phase it is only a 'write' list. + if (Seen.insert(Ptr)) { + ++NumReadWrites; + Accesses.addStore(Ptr); + } } if (IsAnnotatedParallel) { @@ -2718,51 +3950,44 @@ bool LoopVectorizationLegality::canVectorizeMemory() { // If the address of i is unknown (for example A[B[i]]) then we may // read a few words, modify, and write a few words, and some of the // words may be written to the same address. - if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr)) - Reads.insert(std::make_pair(Ptr, LD)); + bool IsReadOnlyPtr = false; + if (Seen.insert(Ptr) || !isStridedPtr(SE, DL, Ptr, TheLoop)) { + ++NumReads; + IsReadOnlyPtr = true; + } + Accesses.addLoad(Ptr, IsReadOnlyPtr); } // If we write (or read-write) to a single destination and there are no // other reads in this loop then is it safe to vectorize. - if (ReadWrites.size() == 1 && Reads.size() == 0) { + if (NumReadWrites == 1 && NumReads == 0) { DEBUG(dbgs() << "LV: Found a write-only loop!\n"); return true; } - unsigned NumReadPtrs = 0; - unsigned NumWritePtrs = 0; + // Build dependence sets and check whether we need a runtime pointer bounds + // check. + Accesses.buildDependenceSets(); + bool NeedRTCheck = Accesses.isRTCheckNeeded(); // Find pointers with computable bounds. We are going to use this information // to place a runtime bound check. - bool CanDoRT = true; - AliasMap::iterator MI, ME; - for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) { - Value *V = (*MI).first; - if (hasComputableBounds(V)) { - PtrRtCheck.insert(SE, TheLoop, V, true); - NumWritePtrs++; - DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n"); - } else { - CanDoRT = false; - break; - } - } - for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) { - Value *V = (*MI).first; - if (hasComputableBounds(V)) { - PtrRtCheck.insert(SE, TheLoop, V, false); - NumReadPtrs++; - DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n"); - } else { - CanDoRT = false; - break; - } - } + unsigned NumComparisons = 0; + bool CanDoRT = false; + if (NeedRTCheck) + CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop); + + + DEBUG(dbgs() << "LV: We need to do " << NumComparisons << + " pointer comparisons.\n"); - // Check that we did not collect too many pointers or found a - // unsizeable pointer. - unsigned NumComparisons = (NumWritePtrs * (NumReadPtrs + NumWritePtrs - 1)); - DEBUG(dbgs() << "LV: We need to compare " << NumComparisons << " ptrs.\n"); + // If we only have one set of dependences to check pointers among we don't + // need a runtime check. + if (NumComparisons == 0 && NeedRTCheck) + NeedRTCheck = false; + + // Check that we did not collect too many pointers or found an unsizeable + // pointer. 
if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { PtrRtCheck.reset(); CanDoRT = false; @@ -2772,122 +3997,69 @@ bool LoopVectorizationLegality::canVectorizeMemory() { DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n"); } - bool NeedRTCheck = false; - - // Biggest vectorized access possible, vector width * unroll factor. - // TODO: We're being very pessimistic here, find a way to know the - // real access width before getting here. - unsigned MaxByteWidth = (TTI->getRegisterBitWidth(true) / 8) * - TTI->getMaximumUnrollFactor(); - // Now that the pointers are in two lists (Reads and ReadWrites), we - // can check that there are no conflicts between each of the writes and - // between the writes to the reads. - // Note that WriteObjects duplicates the stores (indexed now by underlying - // objects) to avoid pointing to elements inside ReadWrites. - // TODO: Maybe create a new type where they can interact without duplication. - AliasMultiMap WriteObjects; - ValueVector TempObjects; - - // Check that the read-writes do not conflict with other read-write - // pointers. - bool AllWritesIdentified = true; - for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) { - Value *Val = (*MI).first; - Instruction *Inst = (*MI).second; - - GetUnderlyingObjects(Val, TempObjects, DL); - for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end(); - UI != UE; ++UI) { - if (!isIdentifiedObject(*UI)) { - DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **UI <<"\n"); - NeedRTCheck = true; - AllWritesIdentified = false; - } + if (NeedRTCheck && !CanDoRT) { + DEBUG(dbgs() << "LV: We can't vectorize because we can't find " << + "the array bounds.\n"); + PtrRtCheck.reset(); + return false; + } - // Never seen it before, can't alias. - if (WriteObjects[*UI].empty()) { - DEBUG(dbgs() << "LV: Adding Underlying value:" << **UI <<"\n"); - WriteObjects[*UI].push_back(Inst); - continue; - } - // Direct alias found. - if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) { - DEBUG(dbgs() << "LV: Found a possible write-write reorder:" - << **UI <<"\n"); - return false; - } - DEBUG(dbgs() << "LV: Found a conflicting global value:" - << **UI <<"\n"); - DEBUG(dbgs() << "LV: While examining store:" << *Inst <<"\n"); - DEBUG(dbgs() << "LV: On value:" << *Val <<"\n"); - - // If global alias, make sure they do alias. - if (hasPossibleGlobalWriteReorder(*UI, - Inst, - WriteObjects, - MaxByteWidth)) { - DEBUG(dbgs() << "LV: Found a possible write-write reorder:" << **UI - << "\n"); + PtrRtCheck.Need = NeedRTCheck; + + bool CanVecMem = true; + if (Accesses.isDependencyCheckNeeded()) { + DEBUG(dbgs() << "LV: Checking memory dependencies\n"); + CanVecMem = DepChecker.areDepsSafe(DependentAccesses, + Accesses.getDependenciesToCheck()); + MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes(); + + if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) { + DEBUG(dbgs() << "LV: Retrying with memory checks\n"); + NeedRTCheck = true; + + // Clear the dependency checks. We assume they are not needed. + Accesses.resetDepChecks(); + + PtrRtCheck.reset(); + PtrRtCheck.Need = true; + + CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, + TheLoop, true); + // Check that we did not collect too many pointers or found an unsizeable + // pointer. + if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { + DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n"); + PtrRtCheck.reset(); return false; } - // Didn't alias, insert into map for further reference. 
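
The retry logic added above, in outline: static dependence checking runs first, and when it fails only because a distance was not a compile-time constant, the analysis is redone with runtime bound checks standing in for the failed tests. A hedged outline with hypothetical helper names, not the pass's real interface:

  bool depsAreSafe();
  bool shouldRetryWithRuntimeCheck();
  void clearDependencyChecks();
  bool canBuildRuntimeChecks();

  bool analyzeLoopMemory() {
    if (depsAreSafe())
      return true;                   // static analysis suffices
    if (!shouldRetryWithRuntimeCheck())
      return false;                  // e.g. a constant distance that is too small
    clearDependencyChecks();         // assume the runtime checks subsume them
    return canBuildRuntimeChecks();  // needs computable bounds and unit stride
  }
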
- WriteObjects[*UI].push_back(Inst); + CanVecMem = true; } - TempObjects.clear(); } - /// Check that the reads don't conflict with the read-writes. - for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) { - Value *Val = (*MI).first; - GetUnderlyingObjects(Val, TempObjects, DL); - for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end(); - UI != UE; ++UI) { - // If all of the writes are identified then we don't care if the read - // pointer is identified or not. - if (!AllWritesIdentified && !isIdentifiedObject(*UI)) { - DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **UI <<"\n"); - NeedRTCheck = true; - } + DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") << + " need a runtime memory check.\n"); - // Never seen it before, can't alias. - if (WriteObjects[*UI].empty()) - continue; - // Direct alias found. - if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) { - DEBUG(dbgs() << "LV: Found a possible write-write reorder:" - << **UI <<"\n"); - return false; - } - DEBUG(dbgs() << "LV: Found a global value: " - << **UI <<"\n"); - Instruction *Inst = (*MI).second; - DEBUG(dbgs() << "LV: While examining load:" << *Inst <<"\n"); - DEBUG(dbgs() << "LV: On value:" << *Val <<"\n"); - - // If global alias, make sure they do alias. - if (hasPossibleGlobalWriteReorder(*UI, - Inst, - WriteObjects, - MaxByteWidth)) { - DEBUG(dbgs() << "LV: Found a possible read-write reorder:" << **UI - << "\n"); - return false; - } - } - TempObjects.clear(); - } + return CanVecMem; +} - PtrRtCheck.Need = NeedRTCheck; - if (NeedRTCheck && !CanDoRT) { - DEBUG(dbgs() << "LV: We can't vectorize because we can't find " << - "the array bounds.\n"); - PtrRtCheck.reset(); - return false; +static bool hasMultipleUsesOf(Instruction *I, + SmallPtrSet<Instruction *, 8> &Insts) { + unsigned NumUses = 0; + for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) { + if (Insts.count(dyn_cast<Instruction>(*Use))) + ++NumUses; + if (NumUses > 1) + return true; } - DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") << - " need a runtime memory check.\n"); + return false; +} + +static bool areAllUsesIn(Instruction *I, SmallPtrSet<Instruction *, 8> &Set) { + for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) + if (!Set.count(dyn_cast<Instruction>(*Use))) + return false; return true; } @@ -2909,116 +4081,154 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // This includes users of the reduction, variables (which form a cycle // which ends in the phi node). Instruction *ExitInstruction = 0; - // Indicates that we found a binary operation in our scan. - bool FoundBinOp = false; + // Indicates that we found a reduction operation in our scan. + bool FoundReduxOp = false; - // Iter is our iterator. We start with the PHI node and scan for all of the - // users of this instruction. All users must be instructions that can be - // used as reduction variables (such as ADD). We may have a single - // out-of-block user. The cycle must end with the original PHI. - Instruction *Iter = Phi; + // We start with the PHI node and scan for all of the users of this + // instruction. All users must be instructions that can be used as reduction + // variables (such as ADD). We must have a single out-of-block user. The cycle + // must include the original PHI. 
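
AddReductionVar, whose rewritten scan begins above and continues below, recognizes cycles of the following shape (C, illustrative):

  int sum(const int *a, int n) {
    int s = 0;                 // becomes the header phi's start value
    for (int i = 0; i < n; ++i)
      s += a[i];               // the single in-loop use feeds the phi back
    return s;                  // the single out-of-loop user: the exit value
  }
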
+ bool FoundStartPHI = false; // To recognize min/max patterns formed by a icmp select sequence, we store // the number of instruction we saw from the recognized min/max pattern, - // such that we don't stop when we see the phi has two uses (one by the select - // and one by the icmp) and to make sure we only see exactly the two - // instructions. + // to make sure we only see exactly the two instructions. unsigned NumCmpSelectPatternInst = 0; ReductionInstDesc ReduxDesc(false, 0); - // Avoid cycles in the chain. SmallPtrSet<Instruction *, 8> VisitedInsts; - while (VisitedInsts.insert(Iter)) { - // If the instruction has no users then this is a broken - // chain and can't be a reduction variable. - if (Iter->use_empty()) + SmallVector<Instruction *, 8> Worklist; + Worklist.push_back(Phi); + VisitedInsts.insert(Phi); + + // A value in the reduction can be used: + // - By the reduction: + // - Reduction operation: + // - One use of reduction value (safe). + // - Multiple use of reduction value (not safe). + // - PHI: + // - All uses of the PHI must be the reduction (safe). + // - Otherwise, not safe. + // - By one instruction outside of the loop (safe). + // - By further instructions outside of the loop (not safe). + // - By an instruction that is not part of the reduction (not safe). + // This is either: + // * An instruction type other than PHI or the reduction operation. + // * A PHI in the header other than the initial PHI. + while (!Worklist.empty()) { + Instruction *Cur = Worklist.back(); + Worklist.pop_back(); + + // No Users. + // If the instruction has no users then this is a broken chain and can't be + // a reduction variable. + if (Cur->use_empty()) return false; - // Did we find a user inside this loop already ? - bool FoundInBlockUser = false; - // Did we reach the initial PHI node already ? - bool FoundStartPHI = false; + bool IsAPhi = isa<PHINode>(Cur); - // Is this a bin op ? - FoundBinOp |= !isa<PHINode>(Iter); + // A header PHI use other than the original PHI. + if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent()) + return false; - // For each of the *users* of iter. - for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end(); - it != e; ++it) { - Instruction *U = cast<Instruction>(*it); - // We already know that the PHI is a user. - if (U == Phi) { - FoundStartPHI = true; - continue; - } + // Reductions of instructions such as Div, and Sub is only possible if the + // LHS is the reduction variable. + if (!Cur->isCommutative() && !IsAPhi && !isa<SelectInst>(Cur) && + !isa<ICmpInst>(Cur) && !isa<FCmpInst>(Cur) && + !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0)))) + return false; + + // Any reduction instruction must be of one of the allowed kinds. + ReduxDesc = isReductionInstr(Cur, Kind, ReduxDesc); + if (!ReduxDesc.IsReduction) + return false; + + // A reduction operation must only have one use of the reduction value. + if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax && + hasMultipleUsesOf(Cur, VisitedInsts)) + return false; + + // All inputs to a PHI node must be a reduction value. + if(IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts)) + return false; + + if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(Cur) || + isa<SelectInst>(Cur))) + ++NumCmpSelectPatternInst; + if (Kind == RK_FloatMinMax && (isa<FCmpInst>(Cur) || + isa<SelectInst>(Cur))) + ++NumCmpSelectPatternInst; + + // Check whether we found a reduction operator. + FoundReduxOp |= !IsAPhi; + + // Process users of current instruction. 
Push non-PHI nodes after PHI nodes
+ // onto the stack. This way we are going to have seen all inputs to PHI
+ // nodes once we get to them.
+ SmallVector<Instruction *, 8> NonPHIs;
+ SmallVector<Instruction *, 8> PHIs;
+ for (Value::use_iterator UI = Cur->use_begin(), E = Cur->use_end(); UI != E;
+ ++UI) {
+ Instruction *Usr = cast<Instruction>(*UI);
// Check if we found the exit user.
- BasicBlock *Parent = U->getParent();
+ BasicBlock *Parent = Usr->getParent();
if (!TheLoop->contains(Parent)) {
- // Exit if you find multiple outside users.
- if (ExitInstruction != 0)
+ // Exit if you find multiple outside users or if the header phi node is
+ // being used. In this case the user uses the value of the previous
+ // iteration, in which case we would lose "VF-1" iterations of the
+ // reduction operation if we vectorize.
+ if (ExitInstruction != 0 || Cur == Phi)
return false;
- ExitInstruction = Iter;
- }
- // We allow in-loop PHINodes which are not the original reduction PHI
- // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE
- // structure) then don't skip this PHI.
- if (isa<PHINode>(Iter) && isa<PHINode>(U) &&
- U->getParent() != TheLoop->getHeader() &&
- TheLoop->contains(U) &&
- Iter->hasNUsesOrMore(2))
- continue;
+ // The instruction used by an outside user must be the last instruction
+ // before we feed back to the reduction phi. Otherwise, we lose VF-1
+ // operations on the value.
+ if (std::find(Phi->op_begin(), Phi->op_end(), Cur) == Phi->op_end())
+ return false;
- // We can't have multiple inside users except for a combination of
- // icmp/select both using the phi.
- if (FoundInBlockUser && !NumCmpSelectPatternInst)
- return false;
- FoundInBlockUser = true;
-
- // Any reduction instr must be of one of the allowed kinds.
- ReduxDesc = isReductionInstr(U, Kind, ReduxDesc);
- if (!ReduxDesc.IsReduction)
- return false;
+ ExitInstruction = Cur;
+ continue;
+ }
- if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(U) || isa<SelectInst>(U)))
- ++NumCmpSelectPatternInst;
- if (Kind == RK_FloatMinMax && (isa<FCmpInst>(U) || isa<SelectInst>(U)))
- ++NumCmpSelectPatternInst;
+ // Process instructions only once (termination).
+ if (VisitedInsts.insert(Usr)) {
+ if (isa<PHINode>(Usr))
+ PHIs.push_back(Usr);
+ else
+ NonPHIs.push_back(Usr);
+ }
+ // Remember that we completed the cycle.
+ if (Usr == Phi)
+ FoundStartPHI = true;
+ }
+ Worklist.append(PHIs.begin(), PHIs.end());
+ Worklist.append(NonPHIs.begin(), NonPHIs.end());
+ }
- // Reductions of instructions such as Div, and Sub is only
- // possible if the LHS is the reduction variable.
- if (!U->isCommutative() && !isa<PHINode>(U) && !isa<SelectInst>(U) &&
- !isa<ICmpInst>(U) && !isa<FCmpInst>(U) && U->getOperand(0) != Iter)
- return false;
+ // This means we have seen one but not the other instruction of the
+ // pattern or more than just a select and cmp.
+ if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) &&
+ NumCmpSelectPatternInst != 2)
+ return false;
- Iter = ReduxDesc.PatternLastInst;
- }
+ if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
+ return false;
- // This means we have seen one but not the other instruction of the
- // pattern or more than just a select and cmp.
- if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) &&
- NumCmpSelectPatternInst != 2)
- return false;
+ // We found a reduction var if we have reached the original phi node and we
+ // only have a single instruction with out-of-loop users.
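A shape that the new exit-user rule rejects, again in illustrative source form (not from the patch):

    // Rejected: 'T' is consumed outside the loop, but 'Sum' (not 'T') is
    // the operand that feeds back into the header PHI. With a
    // vectorization factor of VF, 'T' would reflect only one lane's
    // running value, so the trailing work of the other VF-1 lanes would
    // be lost.
    void badReduction(const int *A, int N, int *Out) {
      int Sum = 0, T = 0;
      for (int i = 0; i < N; ++i) {
        T = Sum + A[i];
        Sum = T + 1;
      }
      *Out = T; // outside user of a value that is not a PHI operand
    }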
- // We found a reduction var if we have reached the original - // phi node and we only have a single instruction with out-of-loop - // users. - if (FoundStartPHI) { - // This instruction is allowed to have out-of-loop users. - AllowedExit.insert(ExitInstruction); + // This instruction is allowed to have out-of-loop users. + AllowedExit.insert(ExitInstruction); - // Save the description of this reduction variable. - ReductionDescriptor RD(RdxStart, ExitInstruction, Kind, - ReduxDesc.MinMaxKind); - Reductions[Phi] = RD; - // We've ended the cycle. This is a reduction variable if we have an - // outside user and it has a binary op. - return FoundBinOp && ExitInstruction; - } - } + // Save the description of this reduction variable. + ReductionDescriptor RD(RdxStart, ExitInstruction, Kind, + ReduxDesc.MinMaxKind); + Reductions[Phi] = RD; + // We've ended the cycle. This is a reduction variable if we have an + // outside user and it has a binary op. - return false; + return true; } /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction @@ -3169,12 +4379,28 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { return !DT->dominates(BB, Latch); } -bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) { +bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, + SmallPtrSet<Value *, 8>& SafePtrs) { for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - // We don't predicate loads/stores at the moment. - if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow()) + // We might be able to hoist the load. + if (it->mayReadFromMemory()) { + LoadInst *LI = dyn_cast<LoadInst>(it); + if (!LI || !SafePtrs.count(LI->getPointerOperand())) + return false; + } + + // We don't predicate stores at the moment. + if (it->mayWriteToMemory() || it->mayThrow()) return false; + // Check that we don't have a constant expression that can trap as operand. + for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end(); + OI != OE; ++OI) { + if (Constant *C = dyn_cast<Constant>(*OI)) + if (C->canTrap()) + return false; + } + // The instructions below can trap. switch (it->getOpcode()) { default: continue; @@ -3189,15 +4415,6 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) { return true; } -bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) { - const SCEV *PhiScev = SE->getSCEV(Ptr); - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); - if (!AR) - return false; - - return AR->isAffine(); -} - LoopVectorizationCostModel::VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, unsigned UserVF) { @@ -3210,13 +4427,19 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, // Find the trip count. unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch()); - DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n"); + DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); unsigned WidestType = getWidestType(); unsigned WidestRegister = TTI.getRegisterBitWidth(true); + unsigned MaxSafeDepDist = -1U; + if (Legal->getMaxSafeDepDistBytes() != -1U) + MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; + WidestRegister = ((WidestRegister < MaxSafeDepDist) ? 
+ WidestRegister : MaxSafeDepDist); unsigned MaxVectorSize = WidestRegister / WidestType; DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"); - DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n"); + DEBUG(dbgs() << "LV: The Widest register is: " + << WidestRegister << " bits.\n"); if (MaxVectorSize == 0) { DEBUG(dbgs() << "LV: The target has no vector registers.\n"); @@ -3252,7 +4475,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, if (UserVF != 0) { assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); - DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n"); + DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); Factor.Width = UserVF; return Factor; @@ -3260,13 +4483,13 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, float Cost = expectedCost(1); unsigned Width = 1; - DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n"); + DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)Cost << ".\n"); for (unsigned i=2; i <= VF; i*=2) { // Notice that the vector loop needs to be executed less times, so // we need to divide the cost of the vector loops by the width of // the vector elements. float VectorCost = expectedCost(i) / (float)i; - DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " << + DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " << (int)VectorCost << ".\n"); if (VectorCost < Cost) { Cost = VectorCost; @@ -3347,6 +4570,10 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, if (OptForSize) return 1; + // We used the distance for the unroll factor. + if (Legal->getMaxSafeDepDistBytes() != -1U) + return 1; + // Do not unroll loops with a relatively small trip count. unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch()); @@ -3386,8 +4613,20 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, else if (UF < 1) UF = 1; - if (Legal->getReductionVars()->size()) { - DEBUG(dbgs() << "LV: Unrolling because of reductions. \n"); + bool HasReductions = Legal->getReductionVars()->size(); + + // Decide if we want to unroll if we decided that it is legal to vectorize + // but not profitable. + if (VF == 1) { + if (TheLoop->getNumBlocks() > 1 || !HasReductions || + LoopCost > SmallLoopCost) + return 1; + + return UF; + } + + if (HasReductions) { + DEBUG(dbgs() << "LV: Unrolling because of reductions.\n"); return UF; } @@ -3395,14 +4634,14 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, // We assume that the cost overhead is 1 and we use the cost model // to estimate the cost of the loop and unroll until the cost of the // loop overhead is about 5% of the cost of the loop. - DEBUG(dbgs() << "LV: Loop cost is "<< LoopCost <<" \n"); - if (LoopCost < 20) { - DEBUG(dbgs() << "LV: Unrolling to reduce branch cost. \n"); - unsigned NewUF = 20/LoopCost + 1; + DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); + if (LoopCost < SmallLoopCost) { + DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n"); + unsigned NewUF = SmallLoopCost / (LoopCost + 1); return std::min(NewUF, UF); } - DEBUG(dbgs() << "LV: Not Unrolling. \n"); + DEBUG(dbgs() << "LV: Not Unrolling.\n"); return 1; } @@ -3503,16 +4742,16 @@ LoopVectorizationCostModel::calculateRegisterUsage() { MaxUsage = std::max(MaxUsage, OpenIntervals.size()); DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " << - OpenIntervals.size() <<"\n"); + OpenIntervals.size() << '\n'); // Add the current instruction to the list of open intervals. 
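// (Illustrative walk-through, not from the patch: if %a is defined at #1
// with its last use at #4, and %b is defined at #2 with its last use at #3,
// then both intervals are open at #2 and #3, so MaxUsage becomes 2.)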
OpenIntervals.insert(I); } unsigned Invariant = LoopInvariants.size(); - DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << " \n"); - DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << " \n"); - DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << " \n"); + DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n'); + DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'); + DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n'); R.LoopInvariantRegs = Invariant; R.MaxLocalUsers = MaxUsage; @@ -3535,15 +4774,15 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { continue; unsigned C = getInstructionCost(it, VF); - Cost += C; - DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " << - VF << " For instruction: "<< *it << "\n"); + BlockCost += C; + DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " << + VF << " For instruction: " << *it << '\n'); } // We assume that if-converted blocks have a 50% chance of being executed. // When the code is scalar then some of the blocks are avoided due to CF. // When the code is vectorized we execute all code paths. - if (Legal->blockNeedsPredication(*bb) && VF == 1) + if (VF == 1 && Legal->blockNeedsPredication(*bb)) BlockCost /= 2; Cost += BlockCost; @@ -3552,6 +4791,59 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { return Cost; } +/// \brief Check whether the address computation for a non-consecutive memory +/// access looks like an unlikely candidate for being merged into the indexing +/// mode. +/// +/// We look for a GEP which has one index that is an induction variable and all +/// other indices are loop invariant. If the stride of this access is also +/// within a small bound we decide that this address computation can likely be +/// merged into the addressing mode. +/// In all other cases, we identify the address computation as complex. +static bool isLikelyComplexAddressComputation(Value *Ptr, + LoopVectorizationLegality *Legal, + ScalarEvolution *SE, + const Loop *TheLoop) { + GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); + if (!Gep) + return true; + + // We are looking for a gep with all loop invariant indices except for one + // which should be an induction variable. + unsigned NumOperands = Gep->getNumOperands(); + for (unsigned i = 1; i < NumOperands; ++i) { + Value *Opd = Gep->getOperand(i); + if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && + !Legal->isInductionVariable(Opd)) + return true; + } + + // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step + // can likely be merged into the address computation. + unsigned MaxMergeDistance = 64; + + const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr)); + if (!AddRec) + return true; + + // Check the step is constant. + const SCEV *Step = AddRec->getStepRecurrence(*SE); + // Calculate the pointer stride and check if it is consecutive. + const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); + if (!C) + return true; + + const APInt &APStepVal = C->getValue()->getValue(); + + // Huge step value - give up. 
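// (Illustrative numbers, not from the patch: with 4-byte elements, A[3*i]
// advances 12 bytes per iteration, which stays under the 64-byte
// MaxMergeDistance and is treated as mergeable into the addressing mode;
// A[100*i] advances 400 bytes per iteration and is classified as a complex
// address computation.)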
+ if (APStepVal.getBitWidth() > 64) + return true; + + int64_t StepVal = APStepVal.getSExtValue(); + + return StepVal > MaxMergeDistance; +} + unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // If we know that this instruction will remain uniform, check the cost of @@ -3647,6 +4939,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy); unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF; if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) { + bool IsComplexComputation = + isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop); unsigned Cost = 0; // The cost of extracting from the value vector and pointer vector. Type *PtrTy = ToVectorTy(Ptr->getType(), VF); @@ -3662,7 +4956,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { } // The cost of the scalar loads/stores. - Cost += VF * TTI.getAddressComputationCost(ValTy->getScalarType()); + Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation); Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, AS); return Cost; @@ -3743,15 +5037,17 @@ Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { char LoopVectorize::ID = 0; static const char lv_name[] = "Loop Vectorization"; INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { - Pass *createLoopVectorizePass() { - return new LoopVectorize(); + Pass *createLoopVectorizePass(bool NoUnrolling) { + return new LoopVectorize(NoUnrolling); } } @@ -3766,3 +5062,96 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { return false; } + + +void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) { + assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); + // Holds vector parameters or scalars, in case of uniform vals. + SmallVector<VectorParts, 4> Params; + + setDebugLocFromInst(Builder, Instr); + + // Find all of the vectorized parameters. + for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { + Value *SrcOp = Instr->getOperand(op); + + // If we are accessing the old induction variable, use the new one. + if (SrcOp == OldInduction) { + Params.push_back(getVectorValue(SrcOp)); + continue; + } + + // Try using previously calculated values. + Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); + + // If the src is an instruction that appeared earlier in the basic block + // then it should already be vectorized. + if (SrcInst && OrigLoop->contains(SrcInst)) { + assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); + // The parameter is a vector value from earlier. + Params.push_back(WidenMap.get(SrcInst)); + } else { + // The parameter is a scalar from outside the loop. Maybe even a constant. + VectorParts Scalars; + Scalars.append(UF, SrcOp); + Params.push_back(Scalars); + } + } + + assert(Params.size() == Instr->getNumOperands() && + "Invalid number of operands"); + + // Does this instruction return a value ? + bool IsVoidRetTy = Instr->getType()->isVoidTy(); + + Value *UndefVec = IsVoidRetTy ? 
0 : + UndefValue::get(Instr->getType()); + // Create a new entry in the WidenMap and initialize it to Undef or Null. + VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); + + // For each vector unroll 'part': + for (unsigned Part = 0; Part < UF; ++Part) { + // For each scalar that we create: + + Instruction *Cloned = Instr->clone(); + if (!IsVoidRetTy) + Cloned->setName(Instr->getName() + ".cloned"); + // Replace the operands of the cloned instructions with extracted scalars. + for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { + Value *Op = Params[op][Part]; + Cloned->setOperand(op, Op); + } + + // Place the cloned scalar in the new loop. + Builder.Insert(Cloned); + + // If the original scalar returns a value we need to place it in a vector + // so that future users will be able to use it. + if (!IsVoidRetTy) + VecResults[Part] = Cloned; + } +} + +void +InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr, + LoopVectorizationLegality*) { + return scalarizeInstruction(Instr); +} + +Value *InnerLoopUnroller::reverseVector(Value *Vec) { + return Vec; +} + +Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { + return V; +} + +Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx, + bool Negate) { + // When unrolling and the VF is 1, we only need to add a simple scalar. + Type *ITy = Val->getType(); + assert(!ITy->isVectorTy() && "Val must be a scalar"); + Constant *C = ConstantInt::get(ITy, StartIdx, Negate); + return Builder.CreateAdd(Val, C, "induction"); +} + diff --git a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index cc30cc9..c72b51f 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -16,18 +16,23 @@ // //===----------------------------------------------------------------------===// #define SV_NAME "slp-vectorizer" -#define DEBUG_TYPE SV_NAME +#define DEBUG_TYPE "SLP" -#include "VecUtils.h" #include "llvm/Transforms/Vectorize.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/Verifier.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" @@ -35,19 +40,1717 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include <algorithm> #include <map> using namespace llvm; static cl::opt<int> -SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, - cl::desc("Only vectorize trees if the gain is above this " - "number. 
(gain = -cost of vectorization)")); + SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, + cl::desc("Only vectorize if you gain more than this " + "number ")); + +static cl::opt<bool> +ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden, + cl::desc("Attempt to vectorize horizontal reductions")); + +static cl::opt<bool> ShouldStartVectorizeHorAtStore( + "slp-vectorize-hor-store", cl::init(false), cl::Hidden, + cl::desc( + "Attempt to vectorize horizontal reductions feeding into a store")); + namespace { +static const unsigned MinVecRegSize = 128; + +static const unsigned RecursionMaxDepth = 12; + +/// A helper class for numbering instructions in multiple blocks. +/// Numbers start at zero for each basic block. +struct BlockNumbering { + + BlockNumbering(BasicBlock *Bb) : BB(Bb), Valid(false) {} + + BlockNumbering() : BB(0), Valid(false) {} + + void numberInstructions() { + unsigned Loc = 0; + InstrIdx.clear(); + InstrVec.clear(); + // Number the instructions in the block. + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + InstrIdx[it] = Loc++; + InstrVec.push_back(it); + assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation"); + } + Valid = true; + } + + int getIndex(Instruction *I) { + assert(I->getParent() == BB && "Invalid instruction"); + if (!Valid) + numberInstructions(); + assert(InstrIdx.count(I) && "Unknown instruction"); + return InstrIdx[I]; + } + + Instruction *getInstruction(unsigned loc) { + if (!Valid) + numberInstructions(); + assert(InstrVec.size() > loc && "Invalid Index"); + return InstrVec[loc]; + } + + void forget() { Valid = false; } + +private: + /// The block we are numbering. + BasicBlock *BB; + /// Is the block numbered. + bool Valid; + /// Maps instructions to numbers and back. + SmallDenseMap<Instruction *, int> InstrIdx; + /// Maps integers to Instructions. + SmallVector<Instruction *, 32> InstrVec; +}; + +/// \returns the parent basic block if all of the instructions in \p VL +/// are in the same block or null otherwise. +static BasicBlock *getSameBlock(ArrayRef<Value *> VL) { + Instruction *I0 = dyn_cast<Instruction>(VL[0]); + if (!I0) + return 0; + BasicBlock *BB = I0->getParent(); + for (int i = 1, e = VL.size(); i < e; i++) { + Instruction *I = dyn_cast<Instruction>(VL[i]); + if (!I) + return 0; + + if (BB != I->getParent()) + return 0; + } + return BB; +} + +/// \returns True if all of the values in \p VL are constants. +static bool allConstant(ArrayRef<Value *> VL) { + for (unsigned i = 0, e = VL.size(); i < e; ++i) + if (!isa<Constant>(VL[i])) + return false; + return true; +} + +/// \returns True if all of the values in \p VL are identical. +static bool isSplat(ArrayRef<Value *> VL) { + for (unsigned i = 1, e = VL.size(); i < e; ++i) + if (VL[i] != VL[0]) + return false; + return true; +} + +/// \returns The opcode if all of the Instructions in \p VL have the same +/// opcode, or zero. +static unsigned getSameOpcode(ArrayRef<Value *> VL) { + Instruction *I0 = dyn_cast<Instruction>(VL[0]); + if (!I0) + return 0; + unsigned Opcode = I0->getOpcode(); + for (int i = 1, e = VL.size(); i < e; i++) { + Instruction *I = dyn_cast<Instruction>(VL[i]); + if (!I || Opcode != I->getOpcode()) + return 0; + } + return Opcode; +} + +/// \returns \p I after propagating metadata from \p VL. 
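/// (For example: if four scalar loads each carry !tbaa metadata, the fused
/// vector load keeps the most generic TBAA tag common to all of them; any
/// metadata kind the switch below does not recognize is dropped.)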
+static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) { + Instruction *I0 = cast<Instruction>(VL[0]); + SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata; + I0->getAllMetadataOtherThanDebugLoc(Metadata); + + for (unsigned i = 0, n = Metadata.size(); i != n; ++i) { + unsigned Kind = Metadata[i].first; + MDNode *MD = Metadata[i].second; + + for (int i = 1, e = VL.size(); MD && i != e; i++) { + Instruction *I = cast<Instruction>(VL[i]); + MDNode *IMD = I->getMetadata(Kind); + + switch (Kind) { + default: + MD = 0; // Remove unknown metadata + break; + case LLVMContext::MD_tbaa: + MD = MDNode::getMostGenericTBAA(MD, IMD); + break; + case LLVMContext::MD_fpmath: + MD = MDNode::getMostGenericFPMath(MD, IMD); + break; + } + } + I->setMetadata(Kind, MD); + } + return I; +} + +/// \returns The type that all of the values in \p VL have or null if there +/// are different types. +static Type* getSameType(ArrayRef<Value *> VL) { + Type *Ty = VL[0]->getType(); + for (int i = 1, e = VL.size(); i < e; i++) + if (VL[i]->getType() != Ty) + return 0; + + return Ty; +} + +/// \returns True if the ExtractElement instructions in VL can be vectorized +/// to use the original vector. +static bool CanReuseExtract(ArrayRef<Value *> VL) { + assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode"); + // Check if all of the extracts come from the same vector and from the + // correct offset. + Value *VL0 = VL[0]; + ExtractElementInst *E0 = cast<ExtractElementInst>(VL0); + Value *Vec = E0->getOperand(0); + + // We have to extract from the same vector type. + unsigned NElts = Vec->getType()->getVectorNumElements(); + + if (NElts != VL.size()) + return false; + + // Check that all of the indices extract from the correct offset. + ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1)); + if (!CI || CI->getZExtValue()) + return false; + + for (unsigned i = 1, e = VL.size(); i < e; ++i) { + ExtractElementInst *E = cast<ExtractElementInst>(VL[i]); + ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1)); + + if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec) + return false; + } + + return true; +} + +static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right) { + + SmallVector<Value *, 16> OrigLeft, OrigRight; + + bool AllSameOpcodeLeft = true; + bool AllSameOpcodeRight = true; + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + Instruction *I = cast<Instruction>(VL[i]); + Value *V0 = I->getOperand(0); + Value *V1 = I->getOperand(1); + + OrigLeft.push_back(V0); + OrigRight.push_back(V1); + + Instruction *I0 = dyn_cast<Instruction>(V0); + Instruction *I1 = dyn_cast<Instruction>(V1); + + // Check whether all operands on one side have the same opcode. In this case + // we want to preserve the original order and not make things worse by + // reordering. + AllSameOpcodeLeft = I0; + AllSameOpcodeRight = I1; + + if (i && AllSameOpcodeLeft) { + if(Instruction *P0 = dyn_cast<Instruction>(OrigLeft[i-1])) { + if(P0->getOpcode() != I0->getOpcode()) + AllSameOpcodeLeft = false; + } else + AllSameOpcodeLeft = false; + } + if (i && AllSameOpcodeRight) { + if(Instruction *P1 = dyn_cast<Instruction>(OrigRight[i-1])) { + if(P1->getOpcode() != I1->getOpcode()) + AllSameOpcodeRight = false; + } else + AllSameOpcodeRight = false; + } + + // Sort two opcodes. In the code below we try to preserve the ability to use + // broadcast of values instead of individual inserts. 
+ // vl1 = load
+ // vl2 = phi
+ // vr1 = load
+ // vr2 = vr1
+ // = vl1 x vr1
+ // = vl2 x vr2
+ // If we just sorted according to opcode we would leave the first line
+ // intact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
+ // = vl1 x vr1
+ // = vr2 x vl2
+ // Because vr2 and vr1 are from the same load we lose the opportunity of a
+ // broadcast for the packed right side in the backend: we have [vr1, vl2]
+ // instead of [vr1, vr2=vr1].
+ if (I0 && I1) {
+ if(!i && I0->getOpcode() > I1->getOpcode()) {
+ Left.push_back(I1);
+ Right.push_back(I0);
+ } else if (i && I0->getOpcode() > I1->getOpcode() && Right[i-1] != I1) {
+ // Try not to destroy a broadcast for no apparent benefit.
+ Left.push_back(I1);
+ Right.push_back(I0);
+ } else if (i && I0->getOpcode() == I1->getOpcode() && Right[i-1] == I0) {
+ // Try to preserve broadcasts.
+ Left.push_back(I1);
+ Right.push_back(I0);
+ } else if (i && I0->getOpcode() == I1->getOpcode() && Left[i-1] == I1) {
+ // Try to preserve broadcasts.
+ Left.push_back(I1);
+ Right.push_back(I0);
+ } else {
+ Left.push_back(I0);
+ Right.push_back(I1);
+ }
+ continue;
+ }
+ // One opcode, put the instruction on the right.
+ if (I0) {
+ Left.push_back(V1);
+ Right.push_back(I0);
+ continue;
+ }
+ Left.push_back(V0);
+ Right.push_back(V1);
+ }
+
+ bool LeftBroadcast = isSplat(Left);
+ bool RightBroadcast = isSplat(Right);
+
+ // Don't reorder if the operands were good to begin with.
+ if (!(LeftBroadcast || RightBroadcast) &&
+ (AllSameOpcodeRight || AllSameOpcodeLeft)) {
+ Left = OrigLeft;
+ Right = OrigRight;
+ }
+}
+
+/// Bottom Up SLP Vectorizer.
+class BoUpSLP {
+public:
+ typedef SmallVector<Value *, 8> ValueList;
+ typedef SmallVector<Instruction *, 16> InstrList;
+ typedef SmallPtrSet<Value *, 16> ValueSet;
+ typedef SmallVector<StoreInst *, 8> StoreList;
+
+ BoUpSLP(Function *Func, ScalarEvolution *Se, DataLayout *Dl,
+ TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li,
+ DominatorTree *Dt) :
+ F(Func), SE(Se), DL(Dl), TTI(Tti), AA(Aa), LI(Li), DT(Dt),
+ Builder(Se->getContext()) {
+ // Set up the block numbering utility for all of the blocks in the
+ // function.
+ for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) {
+ BasicBlock *BB = it;
+ BlocksNumbers[BB] = BlockNumbering(BB);
+ }
+ }
+
+ /// \brief Vectorize the tree that starts with the elements in \p VL.
+ /// Returns the vectorized root.
+ Value *vectorizeTree();
+
+ /// \returns the vectorization cost of the subtree that starts at \p VL.
+ /// A negative number means that this is profitable.
+ int getTreeCost();
+
+ /// Construct a vectorizable tree that starts at \p Roots and is possibly
+ /// used by a reduction of \p RdxOps.
+ void buildTree(ArrayRef<Value *> Roots, ValueSet *RdxOps = 0);
+
+ /// Clear the internal data structures that are created by 'buildTree'.
+ void deleteTree() {
+ RdxOps = 0;
+ VectorizableTree.clear();
+ ScalarToTreeEntry.clear();
+ MustGather.clear();
+ ExternalUses.clear();
+ MemBarrierIgnoreList.clear();
+ }
+
+ /// \returns true if the memory operations A and B are consecutive.
+ bool isConsecutiveAccess(Value *A, Value *B);
+
+ /// \brief Perform LICM and CSE on the newly generated gather sequences.
+ void optimizeGatherSequence();
+private:
+ struct TreeEntry;
+
+ /// \returns the cost of the vectorizable entry.
+ int getEntryCost(TreeEntry *E);
+
+ /// This is the recursive part of buildTree.
+ void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
+
+ /// Vectorize a single entry in the tree.
+ Value *vectorizeTree(TreeEntry *E);
+
+ /// Vectorize a single entry in the tree, starting in \p VL.
+ Value *vectorizeTree(ArrayRef<Value *> VL);
+
+ /// \returns the pointer to the vectorized value if \p VL is already
+ /// vectorized, or NULL. This may happen in cycles.
+ Value *alreadyVectorized(ArrayRef<Value *> VL) const;
+
+ /// \brief Take the pointer operand from the Load/Store instruction.
+ /// \returns NULL if this is not a valid Load/Store instruction.
+ static Value *getPointerOperand(Value *I);
+
+ /// \brief Take the address space operand from the Load/Store instruction.
+ /// \returns -1 if this is not a valid Load/Store instruction.
+ static unsigned getAddressSpaceOperand(Value *I);
+
+ /// \returns the scalarization cost for this type. Scalarization in this
+ /// context means the creation of vectors from a group of scalars.
+ int getGatherCost(Type *Ty);
+
+ /// \returns the scalarization cost for this list of values. Assuming that
+ /// this subtree gets vectorized, we may need to extract the values from the
+ /// roots. This method calculates the cost of extracting the values.
+ int getGatherCost(ArrayRef<Value *> VL);
+
+ /// \returns the AA location that is being accessed by the instruction.
+ AliasAnalysis::Location getLocation(Instruction *I);
+
+ /// \brief Checks if it is possible to sink an instruction from
+ /// \p Src to \p Dst.
+ /// \returns the pointer to the barrier instruction if we can't sink.
+ Value *getSinkBarrier(Instruction *Src, Instruction *Dst);
+
+ /// \returns the index of the last instruction in the BB from \p VL.
+ int getLastIndex(ArrayRef<Value *> VL);
+
+ /// \returns the last Instruction in the bundle \p VL.
+ Instruction *getLastInstruction(ArrayRef<Value *> VL);
+
+ /// \brief Set the Builder insert point to one after the last instruction in
+ /// the bundle.
+ void setInsertPointAfterBundle(ArrayRef<Value *> VL);
+
+ /// \returns a vector from a collection of scalars in \p VL.
+ Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
+
+ /// \returns whether the VectorizableTree is fully vectorizable and will
+ /// be beneficial even if the tree height is tiny.
+ bool isFullyVectorizableTinyTree();
+
+ struct TreeEntry {
+ TreeEntry() : Scalars(), VectorizedValue(0), LastScalarIndex(0),
+ NeedToGather(0) {}
+
+ /// \returns true if the scalars in VL are equal to this entry.
+ bool isSame(ArrayRef<Value *> VL) const {
+ assert(VL.size() == Scalars.size() && "Invalid size");
+ return std::equal(VL.begin(), VL.end(), Scalars.begin());
+ }
+
+ /// A vector of scalars.
+ ValueList Scalars;
+
+ /// The Scalars are vectorized into this value. It is initialized to Null.
+ Value *VectorizedValue;
+
+ /// The index in the basic block of the last scalar.
+ int LastScalarIndex;
+
+ /// Do we need to gather this sequence ?
+ bool NeedToGather;
+ };
+
+ /// Create a new VectorizableTree entry.
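/// (Illustrative: for the four-wide bundle s[i] = a[i] + b[i], i = 0..3,
/// buildTree records four entries through this helper: the store bundle,
/// the add bundle, and the two load bundles. A bundle that cannot be
/// vectorized in place is instead marked NeedToGather.)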
+ TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) { + VectorizableTree.push_back(TreeEntry()); + int idx = VectorizableTree.size() - 1; + TreeEntry *Last = &VectorizableTree[idx]; + Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); + Last->NeedToGather = !Vectorized; + if (Vectorized) { + Last->LastScalarIndex = getLastIndex(VL); + for (int i = 0, e = VL.size(); i != e; ++i) { + assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!"); + ScalarToTreeEntry[VL[i]] = idx; + } + } else { + Last->LastScalarIndex = 0; + MustGather.insert(VL.begin(), VL.end()); + } + return Last; + } + + /// -- Vectorization State -- + /// Holds all of the tree entries. + std::vector<TreeEntry> VectorizableTree; + + /// Maps a specific scalar to its tree entry. + SmallDenseMap<Value*, int> ScalarToTreeEntry; + + /// A list of scalars that we found that we need to keep as scalars. + ValueSet MustGather; + + /// This POD struct describes one external user in the vectorized tree. + struct ExternalUser { + ExternalUser (Value *S, llvm::User *U, int L) : + Scalar(S), User(U), Lane(L){}; + // Which scalar in our function. + Value *Scalar; + // Which user that uses the scalar. + llvm::User *User; + // Which lane does the scalar belong to. + int Lane; + }; + typedef SmallVector<ExternalUser, 16> UserList; + + /// A list of values that need to extracted out of the tree. + /// This list holds pairs of (Internal Scalar : External User). + UserList ExternalUses; + + /// A list of instructions to ignore while sinking + /// memory instructions. This map must be reset between runs of getCost. + ValueSet MemBarrierIgnoreList; + + /// Holds all of the instructions that we gathered. + SetVector<Instruction *> GatherSeq; + /// A list of blocks that we are going to CSE. + SmallSet<BasicBlock *, 8> CSEBlocks; + + /// Numbers instructions in different blocks. + DenseMap<BasicBlock *, BlockNumbering> BlocksNumbers; + + /// Reduction operators. + ValueSet *RdxOps; + + // Analysis and block reference. + Function *F; + ScalarEvolution *SE; + DataLayout *DL; + TargetTransformInfo *TTI; + AliasAnalysis *AA; + LoopInfo *LI; + DominatorTree *DT; + /// Instruction builder to construct the vectorized tree. + IRBuilder<> Builder; +}; + +void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ValueSet *Rdx) { + deleteTree(); + RdxOps = Rdx; + if (!getSameType(Roots)) + return; + buildTree_rec(Roots, 0); + + // Collect the values that we need to extract from the tree. + for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) { + TreeEntry *Entry = &VectorizableTree[EIdx]; + + // For each lane: + for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { + Value *Scalar = Entry->Scalars[Lane]; + + // No need to handle users of gathered values. + if (Entry->NeedToGather) + continue; + + for (Value::use_iterator User = Scalar->use_begin(), + UE = Scalar->use_end(); User != UE; ++User) { + DEBUG(dbgs() << "SLP: Checking user:" << **User << ".\n"); + + // Skip in-tree scalars that become vectors. + if (ScalarToTreeEntry.count(*User)) { + DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << + **User << ".\n"); + int Idx = ScalarToTreeEntry[*User]; (void) Idx; + assert(!VectorizableTree[Idx].NeedToGather && "Bad state"); + continue; + } + Instruction *UserInst = dyn_cast<Instruction>(*User); + if (!UserInst) + continue; + + // Ignore uses that are part of the reduction. 
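// (These users belong to the reduction being vectorized alongside the
// tree, so no extractelement needs to be emitted to feed them.)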
+ if (Rdx && std::find(Rdx->begin(), Rdx->end(), UserInst) != Rdx->end())
+ continue;
+
+ DEBUG(dbgs() << "SLP: Need to extract:" << **User << " from lane " <<
+ Lane << " from " << *Scalar << ".\n");
+ ExternalUses.push_back(ExternalUser(Scalar, *User, Lane));
+ }
+ }
+ }
+}
+
+
+void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
+ bool SameTy = getSameType(VL); (void)SameTy;
+ assert(SameTy && "Invalid types!");
+
+ if (Depth == RecursionMaxDepth) {
+ DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+
+ // Don't handle vectors.
+ if (VL[0]->getType()->isVectorTy()) {
+ DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ if (SI->getValueOperand()->getType()->isVectorTy()) {
+ DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+
+ // If all of the operands are identical or constant we have a simple solution.
+ if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) ||
+ !getSameOpcode(VL)) {
+ DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
+ newTreeEntry(VL, false);
+ return;
+ }
+
+ // We now know that this is a vector of instructions of the same type from
+ // the same block.
+
+ // Check if this is a duplicate of another entry.
+ if (ScalarToTreeEntry.count(VL[0])) {
+ int Idx = ScalarToTreeEntry[VL[0]];
+ TreeEntry *E = &VectorizableTree[Idx];
+ for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+ DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
+ if (E->Scalars[i] != VL[i]) {
+ DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+ DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
+ return;
+ }
+
+ // Check that none of the instructions in the bundle are already in the tree.
+ for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+ if (ScalarToTreeEntry.count(VL[i])) {
+ DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
+ ") is already in tree.\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+
+ // If any of the scalars appears in the table OR it is marked as a value that
+ // needs to stay scalar then we need to gather the scalars.
+ for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+ if (ScalarToTreeEntry.count(VL[i]) || MustGather.count(VL[i])) {
+ DEBUG(dbgs() << "SLP: Gathering due to gathered scalar. \n");
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+
+ // Check that all of the users of the scalars that we want to vectorize are
+ // schedulable.
+ Instruction *VL0 = cast<Instruction>(VL[0]);
+ int MyLastIndex = getLastIndex(VL);
+ BasicBlock *BB = cast<Instruction>(VL0)->getParent();
+
+ for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+ Instruction *Scalar = cast<Instruction>(VL[i]);
+ DEBUG(dbgs() << "SLP: Checking users of " << *Scalar << ". \n");
+ for (Value::use_iterator U = Scalar->use_begin(), UE = Scalar->use_end();
+ U != UE; ++U) {
+ DEBUG(dbgs() << "SLP: \tUser " << **U << ". \n");
+ Instruction *User = dyn_cast<Instruction>(*U);
+ if (!User) {
+ DEBUG(dbgs() << "SLP: Gathering due to unknown user. \n");
+ newTreeEntry(VL, false);
+ return;
+ }
+
+ // We don't care if the user is in a different basic block.
+ BasicBlock *UserBlock = User->getParent();
+ if (UserBlock != BB) {
+ DEBUG(dbgs() << "SLP: User from a different basic block "
+ << *User << ". 
\n"); + continue; + } + + // If this is a PHINode within this basic block then we can place the + // extract wherever we want. + if (isa<PHINode>(*User)) { + DEBUG(dbgs() << "SLP: \tWe can schedule PHIs:" << *User << ". \n"); + continue; + } + + // Check if this is a safe in-tree user. + if (ScalarToTreeEntry.count(User)) { + int Idx = ScalarToTreeEntry[User]; + int VecLocation = VectorizableTree[Idx].LastScalarIndex; + if (VecLocation <= MyLastIndex) { + DEBUG(dbgs() << "SLP: Gathering due to unschedulable vector. \n"); + newTreeEntry(VL, false); + return; + } + DEBUG(dbgs() << "SLP: In-tree user (" << *User << ") at #" << + VecLocation << " vector value (" << *Scalar << ") at #" + << MyLastIndex << ".\n"); + continue; + } + + // This user is part of the reduction. + if (RdxOps && RdxOps->count(User)) + continue; + + // Make sure that we can schedule this unknown user. + BlockNumbering &BN = BlocksNumbers[BB]; + int UserIndex = BN.getIndex(User); + if (UserIndex < MyLastIndex) { + + DEBUG(dbgs() << "SLP: Can't schedule extractelement for " + << *User << ". \n"); + newTreeEntry(VL, false); + return; + } + } + } + + // Check that every instructions appears once in this bundle. + for (unsigned i = 0, e = VL.size(); i < e; ++i) + for (unsigned j = i+1; j < e; ++j) + if (VL[i] == VL[j]) { + DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); + newTreeEntry(VL, false); + return; + } + + // Check that instructions in this bundle don't reference other instructions. + // The runtime of this check is O(N * N-1 * uses(N)) and a typical N is 4. + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end(); + U != UE; ++U) { + for (unsigned j = 0; j < e; ++j) { + if (i != j && *U == VL[j]) { + DEBUG(dbgs() << "SLP: Intra-bundle dependencies!" << **U << ". \n"); + newTreeEntry(VL, false); + return; + } + } + } + } + + DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); + + unsigned Opcode = getSameOpcode(VL); + + // Check if it is safe to sink the loads or the stores. + if (Opcode == Instruction::Load || Opcode == Instruction::Store) { + Instruction *Last = getLastInstruction(VL); + + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + if (VL[i] == Last) + continue; + Value *Barrier = getSinkBarrier(cast<Instruction>(VL[i]), Last); + if (Barrier) { + DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last + << "\n because of " << *Barrier << ". Gathering.\n"); + newTreeEntry(VL, false); + return; + } + } + } + + switch (Opcode) { + case Instruction::PHI: { + PHINode *PH = dyn_cast<PHINode>(VL0); + + // Check for terminator values (e.g. invoke). + for (unsigned j = 0; j < VL.size(); ++j) + for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { + TerminatorInst *Term = dyn_cast<TerminatorInst>(cast<PHINode>(VL[j])->getIncomingValue(i)); + if (Term) { + DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); + newTreeEntry(VL, false); + return; + } + } + + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); + + for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. 
+ for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<PHINode>(VL[j])->getIncomingValue(i)); + + buildTree_rec(Operands, Depth + 1); + } + return; + } + case Instruction::ExtractElement: { + bool Reuse = CanReuseExtract(VL); + if (Reuse) { + DEBUG(dbgs() << "SLP: Reusing extract sequence.\n"); + } + newTreeEntry(VL, Reuse); + return; + } + case Instruction::Load: { + // Check if the loads are consecutive or of we need to swizzle them. + for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { + LoadInst *L = cast<LoadInst>(VL[i]); + if (!L->isSimple() || !isConsecutiveAccess(VL[i], VL[i + 1])) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Need to swizzle loads.\n"); + return; + } + } + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of loads.\n"); + return; + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + Type *SrcTy = VL0->getOperand(0)->getType(); + for (unsigned i = 0; i < VL.size(); ++i) { + Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType(); + if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); + return; + } + } + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of casts.\n"); + + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); + + buildTree_rec(Operands, Depth+1); + } + return; + } + case Instruction::ICmp: + case Instruction::FCmp: { + // Check that all of the compares have the same predicate. + CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); + Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType(); + for (unsigned i = 1, e = VL.size(); i < e; ++i) { + CmpInst *Cmp = cast<CmpInst>(VL[i]); + if (Cmp->getPredicate() != P0 || + Cmp->getOperand(0)->getType() != ComparedTy) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); + return; + } + } + + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of compares.\n"); + + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); + + buildTree_rec(Operands, Depth+1); + } + return; + } + case Instruction::Select: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); + + // Sort operands of the instructions so that each side is more likely to + // have the same opcode. 
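// (Illustrative: the commutative bundle {x0*y0, y1*x1} can be reordered to
// Left = {x0, x1} and Right = {y0, y1}, giving each side a single opcode
// and a better chance to vectorize cheaply.)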
+ if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { + ValueList Left, Right; + reorderInputsAccordingToOpcode(VL, Left, Right); + buildTree_rec(Left, Depth + 1); + buildTree_rec(Right, Depth + 1); + return; + } + + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); + + buildTree_rec(Operands, Depth+1); + } + return; + } + case Instruction::Store: { + // Check if the stores are consecutive or of we need to swizzle them. + for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) + if (!isConsecutiveAccess(VL[i], VL[i + 1])) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Non consecutive store.\n"); + return; + } + + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of stores.\n"); + + ValueList Operands; + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); + + // We can ignore these values because we are sinking them down. + MemBarrierIgnoreList.insert(VL.begin(), VL.end()); + buildTree_rec(Operands, Depth + 1); + return; + } + default: + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); + return; + } +} + +int BoUpSLP::getEntryCost(TreeEntry *E) { + ArrayRef<Value*> VL = E->Scalars; + + Type *ScalarTy = VL[0]->getType(); + if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + + if (E->NeedToGather) { + if (allConstant(VL)) + return 0; + if (isSplat(VL)) { + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); + } + return getGatherCost(E->Scalars); + } + + assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) && + "Invalid VL"); + Instruction *VL0 = cast<Instruction>(VL[0]); + unsigned Opcode = VL0->getOpcode(); + switch (Opcode) { + case Instruction::PHI: { + return 0; + } + case Instruction::ExtractElement: { + if (CanReuseExtract(VL)) + return 0; + return getGatherCost(VecTy); + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + Type *SrcTy = VL0->getOperand(0)->getType(); + + // Calculate the cost of this instruction. + int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), + VL0->getType(), SrcTy); + + VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); + int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); + return VecCost - ScalarCost; + } + case Instruction::FCmp: + case Instruction::ICmp: + case Instruction::Select: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Calculate the cost of this instruction. 
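// (Worked example with made-up TTI numbers: for a 4-wide i32 add bundle
// where the scalar add costs 1 and the <4 x i32> add also costs 1, we get
// ScalarCost = 4 and VecCost = 1, so this entry contributes 1 - 4 = -3,
// i.e. a net gain.)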
+ int ScalarCost = 0; + int VecCost = 0; + if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp || + Opcode == Instruction::Select) { + VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); + ScalarCost = VecTy->getNumElements() * + TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty()); + VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy); + } else { + // Certain instructions can be cheaper to vectorize if they have a + // constant second vector operand. + TargetTransformInfo::OperandValueKind Op1VK = + TargetTransformInfo::OK_AnyValue; + TargetTransformInfo::OperandValueKind Op2VK = + TargetTransformInfo::OK_UniformConstantValue; + + // Check whether all second operands are constant. + for (unsigned i = 0; i < VL.size(); ++i) + if (!isa<ConstantInt>(cast<Instruction>(VL[i])->getOperand(1))) { + Op2VK = TargetTransformInfo::OK_AnyValue; + break; + } + + ScalarCost = + VecTy->getNumElements() * + TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK); + VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK); + } + return VecCost - ScalarCost; + } + case Instruction::Load: { + // Cost of wide load - cost of scalar loads. + int ScalarLdCost = VecTy->getNumElements() * + TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); + int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0); + return VecLdCost - ScalarLdCost; + } + case Instruction::Store: { + // We know that we can merge the stores. Calculate the cost. + int ScalarStCost = VecTy->getNumElements() * + TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); + int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0); + return VecStCost - ScalarStCost; + } + default: + llvm_unreachable("Unknown instruction"); + } +} + +bool BoUpSLP::isFullyVectorizableTinyTree() { + DEBUG(dbgs() << "SLP: Check whether the tree with height " << + VectorizableTree.size() << " is fully vectorizable .\n"); + + // We only handle trees of height 2. + if (VectorizableTree.size() != 2) + return false; + + // Gathering cost would be too much for tiny trees. + if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) + return false; + + return true; +} + +int BoUpSLP::getTreeCost() { + int Cost = 0; + DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << + VectorizableTree.size() << ".\n"); + + // We only vectorize tiny trees if it is fully vectorizable. 
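// (Illustrative: a height-2 tree such as a store bundle fed directly by a
// load bundle is accepted when neither entry needs gathering; a tiny tree
// that does need a gather is priced at INT_MAX below and never chosen.)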
+ if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) { + if (!VectorizableTree.size()) { + assert(!ExternalUses.size() && "We should not have any external users"); + } + return INT_MAX; + } + + unsigned BundleWidth = VectorizableTree[0].Scalars.size(); + + for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) { + int C = getEntryCost(&VectorizableTree[i]); + DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " + << *VectorizableTree[i].Scalars[0] << " .\n"); + Cost += C; + } + + int ExtractCost = 0; + for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end(); + I != E; ++I) { + + VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth); + ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, + I->Lane); + } + + + DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n"); + return Cost + ExtractCost; +} + +int BoUpSLP::getGatherCost(Type *Ty) { + int Cost = 0; + for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) + Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); + return Cost; +} + +int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) { + // Find the type of the operands in VL. + Type *ScalarTy = VL[0]->getType(); + if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + // Find the cost of inserting/extracting values from the vector. + return getGatherCost(VecTy); +} + +AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) { + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return AA->getLocation(SI); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return AA->getLocation(LI); + return AliasAnalysis::Location(); +} + +Value *BoUpSLP::getPointerOperand(Value *I) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return LI->getPointerOperand(); + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return SI->getPointerOperand(); + return 0; +} + +unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { + if (LoadInst *L = dyn_cast<LoadInst>(I)) + return L->getPointerAddressSpace(); + if (StoreInst *S = dyn_cast<StoreInst>(I)) + return S->getPointerAddressSpace(); + return -1; +} + +bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { + Value *PtrA = getPointerOperand(A); + Value *PtrB = getPointerOperand(B); + unsigned ASA = getAddressSpaceOperand(A); + unsigned ASB = getAddressSpaceOperand(B); + + // Check that the address spaces match and that the pointers are valid. + if (!PtrA || !PtrB || (ASA != ASB)) + return false; + + // Make sure that A and B are different pointers of the same type. + if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) + return false; + + unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA); + Type *Ty = cast<PointerType>(PtrA->getType())->getElementType(); + APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty)); + + APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0); + PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA); + PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB); + + APInt OffsetDelta = OffsetB - OffsetA; + + // Check if they are based on the same pointer. That makes the offsets + // sufficient. + if (PtrA == PtrB) + return OffsetDelta == Size; + + // Compute the necessary base pointer delta to have the necessary final delta + // equal to the size. + APInt BaseDelta = Size - OffsetDelta; + + // Otherwise compute the distance with SCEV between the base pointers. 
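// (Illustrative: for p[0] and p[1] with 4-byte elements both pointers strip
// to the same base, so the OffsetDelta == Size test above already decides;
// the SCEV path below covers bases that are only provably BaseDelta apart,
// e.g. distinct GEP chains that SCEV can still fold together.)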
+ const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
+ const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
+ const SCEV *C = SE->getConstant(BaseDelta);
+ const SCEV *X = SE->getAddExpr(PtrSCEVA, C);
+ return X == PtrSCEVB;
+}
+
+Value *BoUpSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) {
+ assert(Src->getParent() == Dst->getParent() && "Not the same BB");
+ BasicBlock::iterator I = Src, E = Dst;
+ // Scan all of the instructions from SRC to DST and check if
+ // the source may alias.
+ for (++I; I != E; ++I) {
+ // Ignore store instructions that are marked as 'ignore'.
+ if (MemBarrierIgnoreList.count(I))
+ continue;
+ if (Src->mayWriteToMemory()) /* Write */ {
+ if (!I->mayReadOrWriteMemory())
+ continue;
+ } else /* Read */ {
+ if (!I->mayWriteToMemory())
+ continue;
+ }
+ AliasAnalysis::Location A = getLocation(&*I);
+ AliasAnalysis::Location B = getLocation(Src);
+
+ if (!A.Ptr || !B.Ptr || AA->alias(A, B))
+ return I;
+ }
+ return 0;
+}
+
+int BoUpSLP::getLastIndex(ArrayRef<Value *> VL) {
+ BasicBlock *BB = cast<Instruction>(VL[0])->getParent();
+ assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block");
+ BlockNumbering &BN = BlocksNumbers[BB];
+
+ int MaxIdx = BN.getIndex(BB->getFirstNonPHI());
+ for (unsigned i = 0, e = VL.size(); i < e; ++i)
+ MaxIdx = std::max(MaxIdx, BN.getIndex(cast<Instruction>(VL[i])));
+ return MaxIdx;
+}
+
+Instruction *BoUpSLP::getLastInstruction(ArrayRef<Value *> VL) {
+ BasicBlock *BB = cast<Instruction>(VL[0])->getParent();
+ assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block");
+ BlockNumbering &BN = BlocksNumbers[BB];
+
+ int MaxIdx = BN.getIndex(cast<Instruction>(VL[0]));
+ for (unsigned i = 1, e = VL.size(); i < e; ++i)
+ MaxIdx = std::max(MaxIdx, BN.getIndex(cast<Instruction>(VL[i])));
+ Instruction *I = BN.getInstruction(MaxIdx);
+ assert(I && "bad location");
+ return I;
+}
+
+void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
+ Instruction *VL0 = cast<Instruction>(VL[0]);
+ Instruction *LastInst = getLastInstruction(VL);
+ BasicBlock::iterator NextInst = LastInst;
+ ++NextInst;
+ Builder.SetInsertPoint(VL0->getParent(), NextInst);
+ Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
+}
+
+Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
+ Value *Vec = UndefValue::get(Ty);
+ // Generate the 'InsertElement' instruction.
+ for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
+ Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
+ if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
+ GatherSeq.insert(Insrt);
+ CSEBlocks.insert(Insrt->getParent());
+
+ // Add to our 'need-to-extract' list.
+ if (ScalarToTreeEntry.count(VL[i])) {
+ int Idx = ScalarToTreeEntry[VL[i]];
+ TreeEntry *E = &VectorizableTree[Idx];
+ // Find which lane we need to extract.
+ int FoundLane = -1;
+ for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
+ // Is this the lane of the scalar that we are looking for?
+ if (E->Scalars[Lane] == VL[i]) { + FoundLane = Lane; + break; + } + } + assert(FoundLane >= 0 && "Could not find the correct lane"); + ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane)); + } + } + } + + return Vec; +} + +Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const { + SmallDenseMap<Value*, int>::const_iterator Entry + = ScalarToTreeEntry.find(VL[0]); + if (Entry != ScalarToTreeEntry.end()) { + int Idx = Entry->second; + const TreeEntry *En = &VectorizableTree[Idx]; + if (En->isSame(VL) && En->VectorizedValue) + return En->VectorizedValue; + } + return 0; +} + +Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { + if (ScalarToTreeEntry.count(VL[0])) { + int Idx = ScalarToTreeEntry[VL[0]]; + TreeEntry *E = &VectorizableTree[Idx]; + if (E->isSame(VL)) + return vectorizeTree(E); + } + + Type *ScalarTy = VL[0]->getType(); + if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + + return Gather(VL, VecTy); +} + +Value *BoUpSLP::vectorizeTree(TreeEntry *E) { + IRBuilder<>::InsertPointGuard Guard(Builder); + + if (E->VectorizedValue) { + DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); + return E->VectorizedValue; + } + + Instruction *VL0 = cast<Instruction>(E->Scalars[0]); + Type *ScalarTy = VL0->getType(); + if (StoreInst *SI = dyn_cast<StoreInst>(VL0)) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); + + if (E->NeedToGather) { + setInsertPointAfterBundle(E->Scalars); + return Gather(E->Scalars, VecTy); + } + + unsigned Opcode = VL0->getOpcode(); + assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode"); + + switch (Opcode) { + case Instruction::PHI: { + PHINode *PH = dyn_cast<PHINode>(VL0); + Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); + Builder.SetCurrentDebugLocation(PH->getDebugLoc()); + PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); + E->VectorizedValue = NewPhi; + + // PHINodes may have multiple entries from the same block. We want to + // visit every block once. + SmallSet<BasicBlock*, 4> VisitedBBs; + + for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { + ValueList Operands; + BasicBlock *IBB = PH->getIncomingBlock(i); + + if (!VisitedBBs.insert(IBB)) { + NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB); + continue; + } + + // Prepare the operand vector. 
+ for (unsigned j = 0; j < E->Scalars.size(); ++j) + Operands.push_back(cast<PHINode>(E->Scalars[j])-> + getIncomingValueForBlock(IBB)); + + Builder.SetInsertPoint(IBB->getTerminator()); + Builder.SetCurrentDebugLocation(PH->getDebugLoc()); + Value *Vec = vectorizeTree(Operands); + NewPhi->addIncoming(Vec, IBB); + } + + assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && + "Invalid number of incoming values"); + return NewPhi; + } + + case Instruction::ExtractElement: { + if (CanReuseExtract(E->Scalars)) { + Value *V = VL0->getOperand(0); + E->VectorizedValue = V; + return V; + } + return Gather(E->Scalars, VecTy); + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + ValueList INVL; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) + INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); + + setInsertPointAfterBundle(E->Scalars); + + Value *InVec = vectorizeTree(INVL); + + if (Value *V = alreadyVectorized(E->Scalars)) + return V; + + CastInst *CI = dyn_cast<CastInst>(VL0); + Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); + E->VectorizedValue = V; + return V; + } + case Instruction::FCmp: + case Instruction::ICmp: { + ValueList LHSV, RHSV; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) { + LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); + RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); + } + + setInsertPointAfterBundle(E->Scalars); + + Value *L = vectorizeTree(LHSV); + Value *R = vectorizeTree(RHSV); + + if (Value *V = alreadyVectorized(E->Scalars)) + return V; + + CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); + Value *V; + if (Opcode == Instruction::FCmp) + V = Builder.CreateFCmp(P0, L, R); + else + V = Builder.CreateICmp(P0, L, R); + + E->VectorizedValue = V; + return V; + } + case Instruction::Select: { + ValueList TrueVec, FalseVec, CondVec; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) { + CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); + TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); + FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2)); + } + + setInsertPointAfterBundle(E->Scalars); + + Value *Cond = vectorizeTree(CondVec); + Value *True = vectorizeTree(TrueVec); + Value *False = vectorizeTree(FalseVec); + + if (Value *V = alreadyVectorized(E->Scalars)) + return V; + + Value *V = Builder.CreateSelect(Cond, True, False); + E->VectorizedValue = V; + return V; + } + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + ValueList LHSVL, RHSVL; + if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) + reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL); + else + for (int i = 0, e = E->Scalars.size(); i < e; ++i) { + LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); + 
RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+ }
+
+ setInsertPointAfterBundle(E->Scalars);
+
+ Value *LHS = vectorizeTree(LHSVL);
+ Value *RHS = vectorizeTree(RHSVL);
+
+ if (LHS == RHS && isa<Instruction>(LHS)) {
+ assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
+ }
+
+ if (Value *V = alreadyVectorized(E->Scalars))
+ return V;
+
+ BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
+ Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
+ E->VectorizedValue = V;
+
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ return propagateMetadata(I, E->Scalars);
+
+ return V;
+ }
+ case Instruction::Load: {
+ // Loads are inserted at the head of the tree because we don't want to
+ // sink them all the way down past store instructions.
+ setInsertPointAfterBundle(E->Scalars);
+
+ LoadInst *LI = cast<LoadInst>(VL0);
+ unsigned AS = LI->getPointerAddressSpace();
+
+ Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
+ VecTy->getPointerTo(AS));
+ unsigned Alignment = LI->getAlignment();
+ LI = Builder.CreateLoad(VecPtr);
+ LI->setAlignment(Alignment);
+ E->VectorizedValue = LI;
+ return propagateMetadata(LI, E->Scalars);
+ }
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(VL0);
+ unsigned Alignment = SI->getAlignment();
+ unsigned AS = SI->getPointerAddressSpace();
+
+ ValueList ValueOp;
+ for (int i = 0, e = E->Scalars.size(); i < e; ++i)
+ ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand());
+
+ setInsertPointAfterBundle(E->Scalars);
+
+ Value *VecValue = vectorizeTree(ValueOp);
+ Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
+ VecTy->getPointerTo(AS));
+ StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
+ S->setAlignment(Alignment);
+ E->VectorizedValue = S;
+ return propagateMetadata(S, E->Scalars);
+ }
+ default:
+ llvm_unreachable("unknown inst");
+ }
+ return 0;
+}
+
+Value *BoUpSLP::vectorizeTree() {
+ Builder.SetInsertPoint(F->getEntryBlock().begin());
+ vectorizeTree(&VectorizableTree[0]);
+
+ DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values.\n");
+
+ // Extract all of the elements with the external uses.
+ for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
+ it != e; ++it) {
+ Value *Scalar = it->Scalar;
+ llvm::User *User = it->User;
+
+ // Skip users that we already RAUWed. This happens when one instruction
+ // has multiple uses of the same value.
+ if (std::find(Scalar->use_begin(), Scalar->use_end(), User) ==
+ Scalar->use_end())
+ continue;
+ assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
+
+ int Idx = ScalarToTreeEntry[Scalar];
+ TreeEntry *E = &VectorizableTree[Idx];
+ assert(!E->NeedToGather && "Extracting from a gather list");
+
+ Value *Vec = E->VectorizedValue;
+ assert(Vec && "Can't find vectorizable value");
+
+ Value *Lane = Builder.getInt32(it->Lane);
+ // Generate extracts for out-of-tree users.
+ // Find the insertion point for the extractelement lane.
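+ // The placement below is chosen case by case: extracts from a vector PHI
+ // go to the first insertion point of its block; extracts feeding a PHI
+ // user go to the end of the matching incoming block; other instruction
+ // users get the extract right before them; non-instruction vectors are
+ // extracted in the function entry block.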
+ if (PHINode *PN = dyn_cast<PHINode>(Vec)) {
+ Builder.SetInsertPoint(PN->getParent()->getFirstInsertionPt());
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ CSEBlocks.insert(PN->getParent());
+ User->replaceUsesOfWith(Scalar, Ex);
+ } else if (isa<Instruction>(Vec)) {
+ if (PHINode *PH = dyn_cast<PHINode>(User)) {
+ for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
+ if (PH->getIncomingValue(i) == Scalar) {
+ Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ CSEBlocks.insert(PH->getIncomingBlock(i));
+ PH->setOperand(i, Ex);
+ }
+ }
+ } else {
+ Builder.SetInsertPoint(cast<Instruction>(User));
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ CSEBlocks.insert(cast<Instruction>(User)->getParent());
+ User->replaceUsesOfWith(Scalar, Ex);
+ }
+ } else {
+ Builder.SetInsertPoint(F->getEntryBlock().begin());
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ CSEBlocks.insert(&F->getEntryBlock());
+ User->replaceUsesOfWith(Scalar, Ex);
+ }
+
+ DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
+ }
+
+ // For each vectorized value:
+ for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
+ TreeEntry *Entry = &VectorizableTree[EIdx];
+
+ // For each lane:
+ for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+ Value *Scalar = Entry->Scalars[Lane];
+
+ // No need to handle users of gathered values.
+ if (Entry->NeedToGather)
+ continue;
+
+ assert(Entry->VectorizedValue && "Can't find vectorizable value");
+
+ Type *Ty = Scalar->getType();
+ if (!Ty->isVoidTy()) {
+ for (Value::use_iterator User = Scalar->use_begin(),
+ UE = Scalar->use_end(); User != UE; ++User) {
+ DEBUG(dbgs() << "SLP: \tvalidating user:" << **User << ".\n");
+
+ assert((ScalarToTreeEntry.count(*User) ||
+ // It is legal to replace the reduction users with undef.
+ (RdxOps && RdxOps->count(*User))) &&
+ "Replacing out-of-tree value with undef");
+ }
+ Value *Undef = UndefValue::get(Ty);
+ Scalar->replaceAllUsesWith(Undef);
+ }
+ DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
+ cast<Instruction>(Scalar)->eraseFromParent();
+ }
+ }
+
+ for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) {
+ BlocksNumbers[it].forget();
+ }
+ Builder.ClearInsertionPoint();
+
+ return VectorizableTree[0].VectorizedValue;
+}
+
+class DTCmp {
+ const DominatorTree *DT;
+
+public:
+ DTCmp(const DominatorTree *DT) : DT(DT) {}
+ bool operator()(const BasicBlock *A, const BasicBlock *B) const {
+ return DT->properlyDominates(A, B);
+ }
+};
+
+void BoUpSLP::optimizeGatherSequence() {
+ DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
+ << " gather sequence instructions.\n");
+ // LICM InsertElementInst sequences.
+ for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
+ e = GatherSeq.end(); it != e; ++it) {
+ InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
+
+ if (!Insert)
+ continue;
+
+ // Check if this block is inside a loop.
+ Loop *L = LI->getLoopFor(Insert->getParent());
+ if (!L)
+ continue;
+
+ // Check if it has a preheader.
+ BasicBlock *PreHeader = L->getLoopPreheader();
+ if (!PreHeader)
+ continue;
+
+ // If the vector or the element that we insert into it are
+ // instructions that are defined inside the loop then we can't
+ // hoist this instruction.
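+ // For example (illustrative): an insertelement whose vector operand and
+ // inserted element are both function arguments (or otherwise defined
+ // outside the loop) is loop invariant and can be moved to the preheader.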
+ Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0)); + Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1)); + if (CurrVec && L->contains(CurrVec)) + continue; + if (NewElem && L->contains(NewElem)) + continue; + + // We can hoist this instruction. Move it to the pre-header. + Insert->moveBefore(PreHeader->getTerminator()); + } + + // Sort blocks by domination. This ensures we visit a block after all blocks + // dominating it are visited. + SmallVector<BasicBlock *, 8> CSEWorkList(CSEBlocks.begin(), CSEBlocks.end()); + std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), DTCmp(DT)); + + // Perform O(N^2) search over the gather sequences and merge identical + // instructions. TODO: We can further optimize this scan if we split the + // instructions into different buckets based on the insert lane. + SmallVector<Instruction *, 16> Visited; + for (SmallVectorImpl<BasicBlock *>::iterator I = CSEWorkList.begin(), + E = CSEWorkList.end(); + I != E; ++I) { + assert((I == CSEWorkList.begin() || !DT->dominates(*I, *llvm::prior(I))) && + "Worklist not sorted properly!"); + BasicBlock *BB = *I; + // For all instructions in blocks containing gather sequences: + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { + Instruction *In = it++; + if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) + continue; + + // Check if we can replace this instruction with any of the + // visited instructions. + for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(), + ve = Visited.end(); + v != ve; ++v) { + if (In->isIdenticalTo(*v) && + DT->dominates((*v)->getParent(), In->getParent())) { + In->replaceAllUsesWith(*v); + In->eraseFromParent(); + In = 0; + break; + } + } + if (In) { + assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end()); + Visited.push_back(In); + } + } + } + CSEBlocks.clear(); + GatherSeq.clear(); +} + /// The SLPVectorizer Pass. struct SLPVectorizer : public FunctionPass { - typedef std::map<Value*, BoUpSLP::StoreList> StoreListMap; + typedef SmallVector<StoreInst *, 8> StoreList; + typedef MapVector<Value *, StoreList> StoreListMap; /// Pass identification, replacement for typeid static char ID; @@ -61,6 +1764,7 @@ struct SLPVectorizer : public FunctionPass { TargetTransformInfo *TTI; AliasAnalysis *AA; LoopInfo *LI; + DominatorTree *DT; virtual bool runOnFunction(Function &F) { SE = &getAnalysis<ScalarEvolution>(); @@ -68,41 +1772,50 @@ struct SLPVectorizer : public FunctionPass { TTI = &getAnalysis<TargetTransformInfo>(); AA = &getAnalysis<AliasAnalysis>(); LI = &getAnalysis<LoopInfo>(); + DT = &getAnalysis<DominatorTree>(); StoreRefs.clear(); bool Changed = false; + // If the target claims to have no vector registers don't attempt + // vectorization. + if (!TTI->getNumberOfRegisters(true)) + return false; + // Must have DataLayout. We can't require it because some tests run w/o // triple. if (!DL) return false; - for (Function::iterator it = F.begin(), e = F.end(); it != e; ++it) { - BasicBlock *BB = it; - bool BBChanged = false; + // Don't vectorize when the attribute NoImplicitFloat is used. + if (F.hasFnAttribute(Attribute::NoImplicitFloat)) + return false; - // Use the bollom up slp vectorizer to construct chains that start with - // he store instructions. - BoUpSLP R(BB, SE, DL, TTI, AA, LI->getLoopFor(BB)); + DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); - // Vectorize trees that end at reductions. 
- BBChanged |= vectorizeReductions(BB, R);
+ // Use the bottom-up SLP vectorizer to construct chains that start with
+ // the store instructions.
+ BoUpSLP R(&F, SE, DL, TTI, AA, LI, DT);
+
+ // Scan the blocks in the function in post order.
+ for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
+ e = po_end(&F.getEntryBlock()); it != e; ++it) {
+ BasicBlock *BB = *it;
 // Vectorize trees that end at stores.
 if (unsigned count = collectStores(BB, R)) {
 (void)count;
- DEBUG(dbgs()<<"SLP: Found " << count << " stores to vectorize.\n");
- BBChanged |= vectorizeStoreChains(R);
+ DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
+ Changed |= vectorizeStoreChains(R);
 }
- // Try to hoist some of the scalarization code to the preheader.
- if (BBChanged) hoistGatherSequence(LI, BB, R);
-
- Changed |= BBChanged;
+ // Vectorize trees that end at reductions.
+ Changed |= vectorizeChainsInBlock(BB, R);
 }
 if (Changed) {
- DEBUG(dbgs()<<"SLP: vectorized \""<<F.getName()<<"\"\n");
+ R.optimizeGatherSequence();
+ DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
 DEBUG(verifyFunction(F));
 }
 return Changed;
@@ -114,6 +1827,10 @@ struct SLPVectorizer : public FunctionPass {
 AU.addRequired<AliasAnalysis>();
 AU.addRequired<TargetTransformInfo>();
 AU.addRequired<LoopInfo>();
+ AU.addRequired<DominatorTree>();
+ AU.addPreserved<LoopInfo>();
+ AU.addPreserved<DominatorTree>();
+ AU.setPreservesCFG();
 }
private:
@@ -125,29 +1842,149 @@ private:
 unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
 /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
- bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
+ bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
 /// \brief Try to vectorize a list of operands.
+ /// \returns true if a value was vectorized.
 bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R);
 /// \brief Try to vectorize a chain that may start at the operands of \V;
- bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
+ bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
 /// \brief Vectorize the stores that were collected in StoreRefs.
 bool vectorizeStoreChains(BoUpSLP &R);
- /// \brief Try to hoist gather sequences outside of the loop in cases where
- /// all of the sources are loop invariant.
- void hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, BoUpSLP &R);
+ /// \brief Scan the basic block and look for patterns that are likely to start
+ /// a vectorization chain.
+ bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
- /// \brief Scan the basic block and look for reductions that may start a
- /// vectorization chain.
- bool vectorizeReductions(BasicBlock *BB, BoUpSLP &R);
+ bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
+ BoUpSLP &R);
+ bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
+ BoUpSLP &R);
private:
 StoreListMap StoreRefs;
};
+/// \brief Check that the values in the slice of the VL array still exist in
+/// the WeakVH array.
+/// Vectorization of part of the VL array may cause later values in the VL array
+/// to become invalid. We track when this has happened in the WeakVH array.
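+/// For example (illustrative): if VL = {S0, S1, S2, S3} and vectorizing the
+/// slice {S0, S1} erased S1, the WeakVH that tracked S1 is cleared and no
+/// longer compares equal to VL[1], so later slices containing S1 are skipped.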
+static bool hasValueBeenRAUWed(ArrayRef<Value *> &VL,
+ SmallVectorImpl<WeakVH> &VH,
+ unsigned SliceBegin,
+ unsigned SliceSize) {
+ for (unsigned i = SliceBegin; i < SliceBegin + SliceSize; ++i)
+ if (VH[i] != VL[i])
+ return true;
+
+ return false;
+}
+
+bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
+ int CostThreshold, BoUpSLP &R) {
+ unsigned ChainLen = Chain.size();
+ DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
+ << "\n");
+ Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
+ unsigned Sz = DL->getTypeSizeInBits(StoreTy);
+ unsigned VF = MinVecRegSize / Sz;
+
+ if (!isPowerOf2_32(Sz) || VF < 2)
+ return false;
+
+ // Keep track of values that were deleted by vectorizing in the loop below.
+ SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
+
+ bool Changed = false;
+ // Look for profitable vectorizable trees at all offsets, starting at zero.
+ for (unsigned i = 0, e = ChainLen; i < e; ++i) {
+ if (i + VF > e)
+ break;
+
+ // Check that a previous iteration of this loop did not delete the Value.
+ if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
+ continue;
+
+ DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
+ << "\n");
+ ArrayRef<Value *> Operands = Chain.slice(i, VF);
+
+ R.buildTree(Operands);
+
+ int Cost = R.getTreeCost();
+
+ DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
+ if (Cost < CostThreshold) {
+ DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+ R.vectorizeTree();
+
+ // Move to the next bundle.
+ i += VF - 1;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
+ int costThreshold, BoUpSLP &R) {
+ SetVector<Value *> Heads, Tails;
+ SmallDenseMap<Value *, Value *> ConsecutiveChain;
+
+ // We may run into multiple chains that merge into a single chain. We mark the
+ // stores that we vectorized so that we don't visit the same store twice.
+ BoUpSLP::ValueSet VectorizedStores;
+ bool Changed = false;
+
+ // Do a quadratic search on all of the given stores and find
+ // all of the pairs of stores that follow each other.
+ for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
+ for (unsigned j = 0; j < e; ++j) {
+ if (i == j)
+ continue;
+
+ if (R.isConsecutiveAccess(Stores[i], Stores[j])) {
+ Tails.insert(Stores[j]);
+ Heads.insert(Stores[i]);
+ ConsecutiveChain[Stores[i]] = Stores[j];
+ }
+ }
+ }
+
+ // For stores that start but don't end a link in the chain:
+ for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end();
+ it != e; ++it) {
+ if (Tails.count(*it))
+ continue;
+
+ // We found a store instr that starts a chain. Now follow the chain and try
+ // to vectorize it.
+ BoUpSLP::ValueList Operands;
+ Value *I = *it;
+ // Collect the chain into a list.
+ while (Tails.count(I) || Heads.count(I)) {
+ if (VectorizedStores.count(I))
+ break;
+ Operands.push_back(I);
+ // Move to the next value in the chain.
+ I = ConsecutiveChain[I];
+ }
+
+ bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R);
+
+ // Mark the vectorized stores so that we don't vectorize them again.
+ if (Vectorized)
+ VectorizedStores.insert(Operands.begin(), Operands.end());
+ Changed |= Vectorized;
+ }
+
+ return Changed;
+}
+
+
unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
 unsigned count = 0;
 StoreRefs.clear();
@@ -156,15 +1993,17 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
 if (!SI)
 continue;
+ // Don't touch volatile stores.
+ if (!SI->isSimple())
+ continue;
+
 // Check that the pointer points to scalars.
 Type *Ty = SI->getValueOperand()->getType();
 if (Ty->isAggregateType() || Ty->isVectorTy())
 return 0;
- // Find the base of the GEP.
- Value *Ptr = SI->getPointerOperand();
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
- Ptr = GEP->getPointerOperand();
+ // Find the base pointer.
+ Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
 // Save the store locations.
 StoreRefs[Ptr].push_back(SI);
@@ -173,34 +2012,83 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
 return count;
}
-bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
- if (!A || !B) return false;
+bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
+ if (!A || !B)
+ return false;
 Value *VL[] = { A, B };
 return tryToVectorizeList(VL, R);
}
bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) {
- DEBUG(dbgs()<<"SLP: Vectorizing a list of length = " << VL.size() << ".\n");
+ if (VL.size() < 2)
+ return false;
+
+ DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n");
+
+ // Check that all of the parts are scalar instructions of the same type.
+ Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+ if (!I0)
+ return false;
+
+ unsigned Opcode0 = I0->getOpcode();
+
+ Type *Ty0 = I0->getType();
+ unsigned Sz = DL->getTypeSizeInBits(Ty0);
+ unsigned VF = MinVecRegSize / Sz;
- // Check that all of the parts are scalar.
 for (int i = 0, e = VL.size(); i < e; ++i) {
 Type *Ty = VL[i]->getType();
 if (Ty->isAggregateType() || Ty->isVectorTy())
- return 0;
+ return false;
+ Instruction *Inst = dyn_cast<Instruction>(VL[i]);
+ if (!Inst || Inst->getOpcode() != Opcode0)
+ return false;
 }
- int Cost = R.getTreeCost(VL);
- int ExtrCost = R.getScalarizationCost(VL);
- DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost <<
- " Cost of extract:" << ExtrCost << ".\n");
- if ((Cost+ExtrCost) >= -SLPCostThreshold) return false;
- DEBUG(dbgs()<<"SLP: Vectorizing pair.\n");
- R.vectorizeArith(VL);
- return true;
+ bool Changed = false;
+
+ // Keep track of values that were deleted by vectorizing in the loop below.
+ SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
+
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+ unsigned OpsWidth = 0;
+
+ if (i + VF > e)
+ OpsWidth = e - i;
+ else
+ OpsWidth = VF;
+
+ if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
+ break;
+
+ // Check that a previous iteration of this loop did not delete the Value.
+ if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
+ continue;
+
+ DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
+ << "\n");
+ ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
+
+ R.buildTree(Ops);
+ int Cost = R.getTreeCost();
+
+ if (Cost < -SLPCostThreshold) {
+ DEBUG(dbgs() << "SLP: Vectorizing pair at cost:" << Cost << ".\n");
+ R.vectorizeTree();
+
+ // Move to the next bundle.
+ i += VF - 1;
+ Changed = true;
+ }
+ }
+
+ return Changed;
}
-bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
- if (!V) return false;
+bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
+ if (!V)
+ return false;
+
 // Try to vectorize V.
 if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
 return true;
@@ -237,38 +2125,502 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
 return 0;
}
-bool SLPVectorizer::vectorizeReductions(BasicBlock *BB, BoUpSLP &R) {
+/// \brief Generate a shuffle mask to be used in a reduction tree.
+///
+/// \param VecLen The length of the vector to be reduced.
+/// \param NumEltsToRdx The number of elements that should be reduced in the
+/// vector.
+/// \param IsPairwise Whether the reduction is a pairwise or splitting
+/// reduction. A pairwise reduction will generate a mask of
+/// <0,2,...> or <1,3,..> while a splitting reduction will generate
+/// <2,3, undef,undef> for a vector of 4 and NumElts = 2.
+/// \param IsLeft If true, generate a mask of even elements, odd otherwise.
+static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
+ bool IsPairwise, bool IsLeft,
+ IRBuilder<> &Builder) {
+ assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
+
+ SmallVector<Constant *, 32> ShuffleMask(
+ VecLen, UndefValue::get(Builder.getInt32Ty()));
+
+ if (IsPairwise)
+ // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
+ for (unsigned i = 0; i != NumEltsToRdx; ++i)
+ ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
+ else
+ // Move the upper half of the vector to the lower half.
+ for (unsigned i = 0; i != NumEltsToRdx; ++i)
+ ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
+
+ return ConstantVector::get(ShuffleMask);
+}
+
+
+/// Model horizontal reductions.
+///
+/// A horizontal reduction is a tree of reduction operations (currently add and
+/// fadd) whose leaves are operations that can be put into a vector.
+/// For example, this tree:
+///
+/// mul mul mul mul
+/// \ / \ /
+/// + +
+/// \ /
+/// +
+/// This tree has "mul" as its reduced values and "+" as its reduction
+/// operations. A reduction might be feeding into a store or a binary operation
+/// feeding a phi.
+/// ...
+/// \ /
+/// +
+/// |
+/// phi +=
+///
+/// Or:
+/// ...
+/// \ /
+/// +
+/// |
+/// *p =
+///
+class HorizontalReduction {
+ SmallPtrSet<Value *, 16> ReductionOps;
+ SmallVector<Value *, 32> ReducedVals;
+
+ BinaryOperator *ReductionRoot;
+ PHINode *ReductionPHI;
+
+ /// The opcode of the reduction.
+ unsigned ReductionOpcode;
+ /// The opcode of the values we perform a reduction on.
+ unsigned ReducedValueOpcode;
+ /// The width of one full horizontal reduction operation.
+ unsigned ReduxWidth;
+ /// Should we model this reduction as a pairwise reduction tree or a tree that
+ /// splits the vector in halves and adds those halves.
+ bool IsPairwiseReduction;
+
+public:
+ HorizontalReduction()
+ : ReductionRoot(0), ReductionPHI(0), ReductionOpcode(0),
+ ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
+
+ /// \brief Try to find a reduction tree.
+ bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
+ DataLayout *DL) {
+ assert((!Phi ||
+ std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
+ "The phi needs to use the binary operator");
+
+ // We could have an initial reduction that is not an add.
+ // r *= v1 + v2 + v3 + v4
+ // In such a case start looking for a tree rooted in the first '+'.
+ if (Phi) {
+ if (B->getOperand(0) == Phi) {
+ Phi = 0;
+ B = dyn_cast<BinaryOperator>(B->getOperand(1));
+ } else if (B->getOperand(1) == Phi) {
+ Phi = 0;
+ B = dyn_cast<BinaryOperator>(B->getOperand(0));
+ }
+ }
+
+ if (!B)
+ return false;
+
+ Type *Ty = B->getType();
+ if (Ty->isVectorTy())
+ return false;
+
+ ReductionOpcode = B->getOpcode();
+ ReducedValueOpcode = 0;
+ ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty);
+ ReductionRoot = B;
+ ReductionPHI = Phi;
+
+ if (ReduxWidth < 4)
+ return false;
+
+ // We currently only support adds.
+ if (ReductionOpcode != Instruction::Add &&
+ ReductionOpcode != Instruction::FAdd)
+ return false;
+
+ // Post-order traverse the reduction tree starting at B. We only handle true
+ // trees containing only binary operators.
+ SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
+ Stack.push_back(std::make_pair(B, 0));
+ while (!Stack.empty()) {
+ BinaryOperator *TreeN = Stack.back().first;
+ unsigned EdgeToVist = Stack.back().second++;
+ bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
+
+ // Only handle trees in the current basic block.
+ if (TreeN->getParent() != B->getParent())
+ return false;
+
+ // Each tree node needs to have one user except for the ultimate
+ // reduction.
+ if (!TreeN->hasOneUse() && TreeN != B)
+ return false;
+
+ // Post-order visit.
+ if (EdgeToVist == 2 || IsReducedValue) {
+ if (IsReducedValue) {
+ // Make sure that the opcodes of the operations that we are going to
+ // reduce match.
+ if (!ReducedValueOpcode)
+ ReducedValueOpcode = TreeN->getOpcode();
+ else if (ReducedValueOpcode != TreeN->getOpcode())
+ return false;
+ ReducedVals.push_back(TreeN);
+ } else {
+ // We need to be able to reassociate the adds.
+ if (!TreeN->isAssociative())
+ return false;
+ ReductionOps.insert(TreeN);
+ }
+ // Retract.
+ Stack.pop_back();
+ continue;
+ }
+
+ // Visit left or right.
+ Value *NextV = TreeN->getOperand(EdgeToVist);
+ BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
+ if (Next)
+ Stack.push_back(std::make_pair(Next, 0));
+ else if (NextV != Phi)
+ return false;
+ }
+ return true;
+ }
+
+ /// \brief Attempt to vectorize the tree found by
+ /// matchAssociativeReduction.
+ bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
+ if (ReducedVals.empty())
+ return false;
+
+ unsigned NumReducedVals = ReducedVals.size();
+ if (NumReducedVals < ReduxWidth)
+ return false;
+
+ Value *VectorizedTree = 0;
+ IRBuilder<> Builder(ReductionRoot);
+ FastMathFlags Unsafe;
+ Unsafe.setUnsafeAlgebra();
+ Builder.SetFastMathFlags(Unsafe);
+ unsigned i = 0;
+
+ for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
+ ArrayRef<Value *> ValsToReduce(&ReducedVals[i], ReduxWidth);
+ V.buildTree(ValsToReduce, &ReductionOps);
+
+ // Estimate cost.
+ int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
+ if (Cost >= -SLPCostThreshold)
+ break;
+
+ DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
+ << ". (HorRdx)\n");
+
+ // Vectorize a tree.
+ DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
+ Value *VectorizedRoot = V.vectorizeTree();
+
+ // Emit a reduction.
+ Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
+ if (VectorizedTree) {
+ Builder.SetCurrentDebugLocation(Loc);
+ VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
+ ReducedSubTree, "bin.rdx");
+ } else
+ VectorizedTree = ReducedSubTree;
+ }
+
+ if (VectorizedTree) {
+ // Finish the reduction.
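+ // The loop above handles ReduxWidth elements at a time; any remaining
+ // scalar values (fewer than ReduxWidth) are folded into the result with
+ // plain scalar operations below.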
+ for (; i < NumReducedVals; ++i) {
+ Builder.SetCurrentDebugLocation(
+ cast<Instruction>(ReducedVals[i])->getDebugLoc());
+ VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
+ ReducedVals[i]);
+ }
+ // Update users.
+ if (ReductionPHI) {
+ assert(ReductionRoot != NULL && "Need a reduction operation");
+ ReductionRoot->setOperand(0, VectorizedTree);
+ ReductionRoot->setOperand(1, ReductionPHI);
+ } else
+ ReductionRoot->replaceAllUsesWith(VectorizedTree);
+ }
+ return VectorizedTree != 0;
+ }
+
+private:
+
+ /// \brief Calculate the cost of a reduction.
+ int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
+ Type *ScalarTy = FirstReducedVal->getType();
+ Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
+
+ int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
+ int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
+
+ IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
+ int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
+
+ int ScalarReduxCost =
+ ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
+
+ DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
+ << " for reduction that starts with " << *FirstReducedVal
+ << " (It is a "
+ << (IsPairwiseReduction ? "pairwise" : "splitting")
+ << " reduction)\n");
+
+ return VecReduxCost - ScalarReduxCost;
+ }
+
+ static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
+ Value *R, const Twine &Name = "") {
+ if (Opcode == Instruction::FAdd)
+ return Builder.CreateFAdd(L, R, Name);
+ return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
+ }
+
+ /// \brief Emit a horizontal reduction of the vectorized value.
+ Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
+ assert(VectorizedValue && "Need to have a vectorized tree node");
+ Instruction *ValToReduce = dyn_cast<Instruction>(VectorizedValue);
+ assert(isPowerOf2_32(ReduxWidth) &&
+ "We only handle power-of-two reductions for now");
+
+ Value *TmpVec = ValToReduce;
+ for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
+ if (IsPairwiseReduction) {
+ Value *LeftMask =
+ createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
+ Value *RightMask =
+ createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
+
+ Value *LeftShuf = Builder.CreateShuffleVector(
+ TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
+ Value *RightShuf = Builder.CreateShuffleVector(
+ TmpVec, UndefValue::get(TmpVec->getType()), RightMask,
+ "rdx.shuf.r");
+ TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
+ "bin.rdx");
+ } else {
+ Value *UpperHalf =
+ createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
+ Value *Shuf = Builder.CreateShuffleVector(
+ TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
+ TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
+ }
+ }
+
+ // The result is in the first element of the vector.
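+ // For example (illustrative, splitting reduction, ReduxWidth == 4):
+ // TmpVec = <a, b, c, d>
+ // i = 2: mask <2,3,u,u> -> <c, d, u, u>, add -> <a+c, b+d, u, u>
+ // i = 1: mask <1,u,u,u> -> <b+d, u, u, u>, add -> <a+c+b+d, u, u, u>
+ // so lane 0 holds the full sum extracted below.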
+ return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+ }
+};
+
+/// \brief Recognize construction of vectors like
+/// %ra = insertelement <4 x float> undef, float %s0, i32 0
+/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
+/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
+/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
+///
+/// Returns true if it matches.
+///
+static bool findBuildVector(InsertElementInst *IE,
+ SmallVectorImpl<Value *> &Ops) {
+ if (!isa<UndefValue>(IE->getOperand(0)))
+ return false;
+
+ while (true) {
+ Ops.push_back(IE->getOperand(1));
+
+ if (IE->use_empty())
+ return false;
+
+ InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->use_back());
+ if (!NextUse)
+ return true;
+
+ // If this isn't the final use, make sure the next insertelement is the only
+ // use. It's OK if the final constructed vector is used multiple times.
+ if (!IE->hasOneUse())
+ return false;
+
+ IE = NextUse;
+ }
+
+ return false;
+}
+
+static bool PhiTypeSorterFunc(Value *V, Value *V2) {
+ return V->getType() < V2->getType();
+}
+
+bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
 bool Changed = false;
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
- if (isa<DbgInfoIntrinsic>(it)) continue;
+ SmallVector<Value *, 4> Incoming;
+ SmallSet<Value *, 16> VisitedInstrs;
+
+ bool HaveVectorizedPhiNodes = true;
+ while (HaveVectorizedPhiNodes) {
+ HaveVectorizedPhiNodes = false;
+
+ // Collect the incoming values from the PHIs.
+ Incoming.clear();
+ for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
+ ++instr) {
+ PHINode *P = dyn_cast<PHINode>(instr);
+ if (!P)
+ break;
+
+ if (!VisitedInstrs.count(P))
+ Incoming.push_back(P);
+ }
+
+ // Sort by type.
+ std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
+
+ // Try to vectorize elements based on their type.
+ for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
+ E = Incoming.end();
+ IncIt != E;) {
+
+ // Look for the next elements with the same type.
+ SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
+ while (SameTypeIt != E &&
+ (*SameTypeIt)->getType() == (*IncIt)->getType()) {
+ VisitedInstrs.insert(*SameTypeIt);
+ ++SameTypeIt;
+ }
+
+ // Try to vectorize them.
+ unsigned NumElts = (SameTypeIt - IncIt);
+ DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
+ if (NumElts > 1 &&
+ tryToVectorizeList(ArrayRef<Value *>(IncIt, NumElts), R)) {
+ // Success; start over because instructions might have been changed.
+ HaveVectorizedPhiNodes = true;
+ Changed = true;
+ break;
+ }
+
+ // Start over at the next instruction of a different type (or the end).
+ IncIt = SameTypeIt;
+ }
+ }
+
+ VisitedInstrs.clear();
+
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
+ // We may go through BB multiple times so skip the ones we have checked.
+ if (!VisitedInstrs.insert(it))
+ continue;
+
+ if (isa<DbgInfoIntrinsic>(it))
+ continue;
 // Try to vectorize reductions that use PHINodes.
 if (PHINode *P = dyn_cast<PHINode>(it)) {
 // Check that the PHI is a reduction PHI.
- if (P->getNumIncomingValues() != 2) return Changed;
- Value *Rdx = (P->getIncomingBlock(0) == BB ? P->getIncomingValue(0) :
- (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) :
- 0));
+ if (P->getNumIncomingValues() != 2)
+ return Changed;
+ Value *Rdx =
+ (P->getIncomingBlock(0) == BB
+ ? (P->getIncomingValue(0))
+ : (P->getIncomingBlock(1) == BB ?
P->getIncomingValue(1) : 0));
 // Check if this is a Binary Operator.
 BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
 if (!BI)
 continue;
- Value *Inst = BI->getOperand(0);
- if (Inst == P) Inst = BI->getOperand(1);
- Changed |= tryToVectorize(dyn_cast<BinaryOperator>(Inst), R);
+ // Try to match and vectorize a horizontal reduction.
+ HorizontalReduction HorRdx;
+ if (ShouldVectorizeHor &&
+ HorRdx.matchAssociativeReduction(P, BI, DL) &&
+ HorRdx.tryToReduce(R, TTI)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+
+ Value *Inst = BI->getOperand(0);
+ if (Inst == P)
+ Inst = BI->getOperand(1);
+
+ if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
+ // We would like to start over since some instructions are deleted
+ // and the iterator may become invalid.
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+
 continue;
 }
+ // Try to vectorize horizontal reductions feeding into a store.
+ if (ShouldStartVectorizeHorAtStore)
+ if (StoreInst *SI = dyn_cast<StoreInst>(it))
+ if (BinaryOperator *BinOp =
+ dyn_cast<BinaryOperator>(SI->getValueOperand())) {
+ HorizontalReduction HorRdx;
+ if (((HorRdx.matchAssociativeReduction(0, BinOp, DL) &&
+ HorRdx.tryToReduce(R, TTI)) ||
+ tryToVectorize(BinOp, R))) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+ }
+
 // Try to vectorize trees that start at compare instructions.
 if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
 if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
- Changed |= true;
+ Changed = true;
+ // We would like to start over since some instructions are deleted
+ // and the iterator may become invalid.
+ it = BB->begin();
+ e = BB->end();
 continue;
 }
- for (int i = 0; i < 2; ++i)
- if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i)))
- Changed |= tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R);
+
+ for (int i = 0; i < 2; ++i) {
+ if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
+ if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
+ Changed = true;
+ // We would like to start over since some instructions are deleted
+ // and the iterator may become invalid.
+ it = BB->begin();
+ e = BB->end();
+ }
+ }
+ }
+ continue;
+ }
+
+ // Try to vectorize trees that start at insertelement instructions.
+ if (InsertElementInst *IE = dyn_cast<InsertElementInst>(it)) {
+ SmallVector<Value *, 8> Ops;
+ if (!findBuildVector(IE, Ops))
+ continue;
+
+ if (tryToVectorizeList(Ops, R)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ }
+ continue;
+ }
 }
@@ -284,51 +2636,19 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
 if (it->second.size() < 2)
 continue;
- DEBUG(dbgs()<<"SLP: Analyzing a store chain of length " <<
- it->second.size() << ".\n");
+ DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
+ << it->second.size() << ".\n");
- Changed |= R.vectorizeStores(it->second, -SLPCostThreshold);
+ // Process the stores in chunks of 16.
+ for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
+ unsigned Len = std::min<unsigned>(CE - CI, 16);
+ ArrayRef<StoreInst *> Chunk(&it->second[CI], Len);
+ Changed |= vectorizeStores(Chunk, -SLPCostThreshold, R);
+ }
 }
 return Changed;
}
-void SLPVectorizer::hoistGatherSequence(LoopInfo *LI, BasicBlock *BB,
- BoUpSLP &R) {
- // Check if this block is inside a loop.
- Loop *L = LI->getLoopFor(BB);
- if (!L)
- return;
-
- // Check if it has a preheader.
- BasicBlock *PreHeader = L->getLoopPreheader(); - if (!PreHeader) - return; - - // Mark the insertion point for the block. - Instruction *Location = PreHeader->getTerminator(); - - BoUpSLP::ValueList &Gathers = R.getGatherSeqInstructions(); - for (BoUpSLP::ValueList::iterator it = Gathers.begin(), e = Gathers.end(); - it != e; ++it) { - InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it); - - // The InsertElement sequence can be simplified into a constant. - if (!Insert) - continue; - - // If the vector or the element that we insert into it are - // instructions that are defined in this basic block then we can't - // hoist this instruction. - Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0)); - Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1)); - if (CurrVec && L->contains(CurrVec)) continue; - if (NewElem && L->contains(NewElem)) continue; - - // We can hoist this instruction. Move it to the pre-header. - Insert->moveBefore(Location); - } -} - } // end anonymous namespace char SLPVectorizer::ID = 0; @@ -341,8 +2661,5 @@ INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false) namespace llvm { - Pass *createSLPVectorizerPass() { - return new SLPVectorizer(); - } +Pass *createSLPVectorizerPass() { return new SLPVectorizer(); } } - diff --git a/contrib/llvm/lib/Transforms/Vectorize/VecUtils.cpp b/contrib/llvm/lib/Transforms/Vectorize/VecUtils.cpp deleted file mode 100644 index 9b94366..0000000 --- a/contrib/llvm/lib/Transforms/Vectorize/VecUtils.cpp +++ /dev/null @@ -1,730 +0,0 @@ -//===- VecUtils.cpp --- Vectorization Utilities ---------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -#define DEBUG_TYPE "SLP" - -#include "VecUtils.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/Verifier.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/Value.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" -#include <algorithm> -#include <map> - -using namespace llvm; - -static const unsigned MinVecRegSize = 128; - -static const unsigned RecursionMaxDepth = 6; - -namespace llvm { - -BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl, - TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp) : - BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa), L(Lp) { - numberInstructions(); -} - -void BoUpSLP::numberInstructions() { - int Loc = 0; - InstrIdx.clear(); - InstrVec.clear(); - // Number the instructions in the block. 
- for (BasicBlock::iterator it=BB->begin(), e=BB->end(); it != e; ++it) { - InstrIdx[it] = Loc++; - InstrVec.push_back(it); - assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation"); - } -} - -Value *BoUpSLP::getPointerOperand(Value *I) { - if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand(); - if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand(); - return 0; -} - -unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { - if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace(); - if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->getPointerAddressSpace(); - return -1; -} - -bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { - Value *PtrA = getPointerOperand(A); - Value *PtrB = getPointerOperand(B); - unsigned ASA = getAddressSpaceOperand(A); - unsigned ASB = getAddressSpaceOperand(B); - - // Check that the address spaces match and that the pointers are valid. - if (!PtrA || !PtrB || (ASA != ASB)) return false; - - // Check that A and B are of the same type. - if (PtrA->getType() != PtrB->getType()) return false; - - // Calculate the distance. - const SCEV *PtrSCEVA = SE->getSCEV(PtrA); - const SCEV *PtrSCEVB = SE->getSCEV(PtrB); - const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB); - const SCEVConstant *ConstOffSCEV = dyn_cast<SCEVConstant>(OffsetSCEV); - - // Non constant distance. - if (!ConstOffSCEV) return false; - - int64_t Offset = ConstOffSCEV->getValue()->getSExtValue(); - Type *Ty = cast<PointerType>(PtrA->getType())->getElementType(); - // The Instructions are connsecutive if the size of the first load/store is - // the same as the offset. - int64_t Sz = DL->getTypeStoreSize(Ty); - return ((-Offset) == Sz); -} - -bool BoUpSLP::vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold) { - Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType(); - unsigned Sz = DL->getTypeSizeInBits(StoreTy); - unsigned VF = MinVecRegSize / Sz; - - if (!isPowerOf2_32(Sz) || VF < 2) return false; - - bool Changed = false; - // Look for profitable vectorizable trees at all offsets, starting at zero. - for (unsigned i = 0, e = Chain.size(); i < e; ++i) { - if (i + VF > e) return Changed; - DEBUG(dbgs()<<"SLP: Analyzing " << VF << " stores at offset "<< i << "\n"); - ArrayRef<Value *> Operands = Chain.slice(i, VF); - - int Cost = getTreeCost(Operands); - DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); - if (Cost < CostThreshold) { - DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); - vectorizeTree(Operands, VF); - i += VF - 1; - Changed = true; - } - } - - return Changed; -} - -bool BoUpSLP::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold) { - ValueSet Heads, Tails; - SmallDenseMap<Value*, Value*> ConsecutiveChain; - - // We may run into multiple chains that merge into a single chain. We mark the - // stores that we vectorized so that we don't visit the same store twice. - ValueSet VectorizedStores; - bool Changed = false; - - // Do a quadratic search on all of the given stores and find - // all of the pairs of loads that follow each other. 
- for (unsigned i = 0, e = Stores.size(); i < e; ++i) - for (unsigned j = 0; j < e; ++j) { - if (i == j) continue; - if (isConsecutiveAccess(Stores[i], Stores[j])) { - Tails.insert(Stores[j]); - Heads.insert(Stores[i]); - ConsecutiveChain[Stores[i]] = Stores[j]; - } - } - - // For stores that start but don't end a link in the chain: - for (ValueSet::iterator it = Heads.begin(), e = Heads.end();it != e; ++it) { - if (Tails.count(*it)) continue; - - // We found a store instr that starts a chain. Now follow the chain and try - // to vectorize it. - ValueList Operands; - Value *I = *it; - // Collect the chain into a list. - while (Tails.count(I) || Heads.count(I)) { - if (VectorizedStores.count(I)) break; - Operands.push_back(I); - // Move to the next value in the chain. - I = ConsecutiveChain[I]; - } - - bool Vectorized = vectorizeStoreChain(Operands, costThreshold); - - // Mark the vectorized stores so that we don't vectorize them again. - if (Vectorized) - VectorizedStores.insert(Operands.begin(), Operands.end()); - Changed |= Vectorized; - } - - return Changed; -} - -int BoUpSLP::getScalarizationCost(ArrayRef<Value *> VL) { - // Find the type of the operands in VL. - Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); - // Find the cost of inserting/extracting values from the vector. - return getScalarizationCost(VecTy); -} - -int BoUpSLP::getScalarizationCost(Type *Ty) { - int Cost = 0; - for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) - Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); - return Cost; -} - -AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) { - if (StoreInst *SI = dyn_cast<StoreInst>(I)) return AA->getLocation(SI); - if (LoadInst *LI = dyn_cast<LoadInst>(I)) return AA->getLocation(LI); - return AliasAnalysis::Location(); -} - -Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) { - assert(Src->getParent() == Dst->getParent() && "Not the same BB"); - BasicBlock::iterator I = Src, E = Dst; - /// Scan all of the instruction from SRC to DST and check if - /// the source may alias. - for (++I; I != E; ++I) { - // Ignore store instructions that are marked as 'ignore'. - if (MemBarrierIgnoreList.count(I)) continue; - if (Src->mayWriteToMemory()) /* Write */ { - if (!I->mayReadOrWriteMemory()) continue; - } else /* Read */ { - if (!I->mayWriteToMemory()) continue; - } - AliasAnalysis::Location A = getLocation(&*I); - AliasAnalysis::Location B = getLocation(Src); - - if (!A.Ptr || !B.Ptr || AA->alias(A, B)) - return I; - } - return 0; -} - -void BoUpSLP::vectorizeArith(ArrayRef<Value *> Operands) { - Value *Vec = vectorizeTree(Operands, Operands.size()); - BasicBlock::iterator Loc = cast<Instruction>(Vec); - IRBuilder<> Builder(++Loc); - // After vectorizing the operands we need to generate extractelement - // instructions and replace all of the uses of the scalar values with - // the values that we extracted from the vectorized tree. - for (unsigned i = 0, e = Operands.size(); i != e; ++i) { - Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i)); - Operands[i]->replaceAllUsesWith(S); - } -} - -int BoUpSLP::getTreeCost(ArrayRef<Value *> VL) { - // Get rid of the list of stores that were removed, and from the - // lists of instructions with multiple users. 
- MemBarrierIgnoreList.clear(); - LaneMap.clear(); - MultiUserVals.clear(); - MustScalarize.clear(); - - // Scan the tree and find which value is used by which lane, and which values - // must be scalarized. - getTreeUses_rec(VL, 0); - - // Check that instructions with multiple users can be vectorized. Mark unsafe - // instructions. - for (ValueSet::iterator it = MultiUserVals.begin(), - e = MultiUserVals.end(); it != e; ++it) { - // Check that all of the users of this instr are within the tree - // and that they are all from the same lane. - int Lane = -1; - for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end(); - I != E; ++I) { - if (LaneMap.find(*I) == LaneMap.end()) { - MustScalarize.insert(*it); - DEBUG(dbgs()<<"SLP: Adding " << **it << - " to MustScalarize because of an out of tree usage.\n"); - break; - } - if (Lane == -1) Lane = LaneMap[*I]; - if (Lane != LaneMap[*I]) { - MustScalarize.insert(*it); - DEBUG(dbgs()<<"Adding " << **it << - " to MustScalarize because multiple lane use it: " - << Lane << " and " << LaneMap[*I] << ".\n"); - break; - } - } - } - - // Now calculate the cost of vectorizing the tree. - return getTreeCost_rec(VL, 0); -} - -void BoUpSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) { - if (Depth == RecursionMaxDepth) return; - - // Don't handle vectors. - if (VL[0]->getType()->isVectorTy()) return; - if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) - if (SI->getValueOperand()->getType()->isVectorTy()) return; - - // Check if all of the operands are constants. - bool AllConst = true; - bool AllSameScalar = true; - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - AllConst &= isa<Constant>(VL[i]); - AllSameScalar &= (VL[0] == VL[i]); - Instruction *I = dyn_cast<Instruction>(VL[i]); - // If one of the instructions is out of this BB, we need to scalarize all. - if (I && I->getParent() != BB) return; - } - - // If all of the operands are identical or constant we have a simple solution. - if (AllConst || AllSameScalar) return; - - // Scalarize unknown structures. - Instruction *VL0 = dyn_cast<Instruction>(VL[0]); - if (!VL0) return; - - unsigned Opcode = VL0->getOpcode(); - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - Instruction *I = dyn_cast<Instruction>(VL[i]); - // If not all of the instructions are identical then we have to scalarize. - if (!I || Opcode != I->getOpcode()) return; - } - - // Mark instructions with multiple users. - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - Instruction *I = dyn_cast<Instruction>(VL[i]); - // Remember to check if all of the users of this instr are vectorized - // within our tree. - if (I && I->getNumUses() > 1) MultiUserVals.insert(I); - } - - for (int i = 0, e = VL.size(); i < e; ++i) { - // Check that the instruction is only used within - // one lane. - if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) return; - // Make this instruction as 'seen' and remember the lane. 
-
-void BoUpSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) {
-  if (Depth == RecursionMaxDepth) return;
-
-  // Don't handle vectors.
-  if (VL[0]->getType()->isVectorTy()) return;
-  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
-    if (SI->getValueOperand()->getType()->isVectorTy()) return;
-
-  // Check if all of the operands are constants or the same scalar.
-  bool AllConst = true;
-  bool AllSameScalar = true;
-  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-    AllConst &= isa<Constant>(VL[i]);
-    AllSameScalar &= (VL[0] == VL[i]);
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    // If one of the instructions is out of this BB, we need to scalarize all.
-    if (I && I->getParent() != BB) return;
-  }
-
-  // If all of the operands are identical or constant we have a simple solution.
-  if (AllConst || AllSameScalar) return;
-
-  // Scalarize unknown structures.
-  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
-  if (!VL0) return;
-
-  unsigned Opcode = VL0->getOpcode();
-  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    // If not all of the instructions are identical then we have to scalarize.
-    if (!I || Opcode != I->getOpcode()) return;
-  }
-
-  // Mark instructions with multiple users.
-  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    // Remember to check if all of the users of this instr are vectorized
-    // within our tree.
-    if (I && I->getNumUses() > 1) MultiUserVals.insert(I);
-  }
-
-  for (int i = 0, e = VL.size(); i < e; ++i) {
-    // Check that the instruction is only used within one lane.
-    if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) return;
-    // Mark this instruction as 'seen' and remember the lane.
-    LaneMap[VL[i]] = i;
-  }
-
-  switch (Opcode) {
-  case Instruction::ZExt:
-  case Instruction::SExt:
-  case Instruction::FPToUI:
-  case Instruction::FPToSI:
-  case Instruction::FPExt:
-  case Instruction::PtrToInt:
-  case Instruction::IntToPtr:
-  case Instruction::SIToFP:
-  case Instruction::UIToFP:
-  case Instruction::Trunc:
-  case Instruction::FPTrunc:
-  case Instruction::BitCast:
-  case Instruction::Add:
-  case Instruction::FAdd:
-  case Instruction::Sub:
-  case Instruction::FSub:
-  case Instruction::Mul:
-  case Instruction::FMul:
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::FDiv:
-  case Instruction::URem:
-  case Instruction::SRem:
-  case Instruction::FRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
-      ValueList Operands;
-      // Prepare the operand vector.
-      for (unsigned j = 0; j < VL.size(); ++j)
-        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
-
-      getTreeUses_rec(Operands, Depth+1);
-    }
-    return;
-  }
-  case Instruction::Store: {
-    ValueList Operands;
-    for (unsigned j = 0; j < VL.size(); ++j)
-      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
-    getTreeUses_rec(Operands, Depth+1);
-    return;
-  }
-  default:
-    return;
-  }
-}
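[The recursive cost walk that follows shares one convention, also stated in the VecUtils.h interface later in this diff: each case returns the vector cost minus the total scalar cost, so a negative result means vectorization is expected to pay off. A minimal numeric illustration with assumed per-instruction costs; real numbers come from TargetTransformInfo:]

    #include <cstdio>

    int main() {
      const int VF = 4;      // four scalars per bundle
      int ScalarAddCost = 1; // assumed cost of one scalar add
      int VecAddCost = 1;    // assumed cost of one <4 x i32> add
      // Convention used by getTreeCost_rec: VecCost - ScalarCost.
      int Delta = VecAddCost - VF * ScalarAddCost; // 1 - 4 = -3
      printf("add bundle delta: %d (%s)\n", Delta,
             Delta < 0 ? "profitable" : "not profitable");
      return 0;
    }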
-
-int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) {
-  Type *ScalarTy = VL[0]->getType();
-
-  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
-    ScalarTy = SI->getValueOperand()->getType();
-
-  // Don't mess with vectors.
-  if (ScalarTy->isVectorTy()) return max_cost;
-  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
-
-  if (Depth == RecursionMaxDepth) return getScalarizationCost(VecTy);
-
-  // Check if all of the operands are constants or the same scalar.
-  bool AllConst = true;
-  bool AllSameScalar = true;
-  bool MustScalarizeFlag = false;
-  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-    AllConst &= isa<Constant>(VL[i]);
-    AllSameScalar &= (VL[0] == VL[i]);
-    // Remember if any of the values was marked for scalarization.
-    MustScalarizeFlag |= MustScalarize.count(VL[i]);
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    // If this instruction is outside the basic block we have to scalarize.
-    if (I && I->getParent() != BB)
-      return getScalarizationCost(VecTy);
-  }
-
-  // Is this a simple vector constant?
-  if (AllConst) return 0;
-
-  // If all of the operands are identical we can broadcast them.
-  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
-  if (AllSameScalar) {
-    // If we are in a loop, and this is not an instruction (e.g. constant or
-    // argument) or the instruction is defined outside the loop then assume
-    // that the cost is zero.
-    if (L && (!VL0 || !L->contains(VL0)))
-      return 0;
-
-    // We need to broadcast the scalar.
-    return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
-  }
-
-  // If one of the values was marked for scalarization (because of multi-lane
-  // or out-of-tree users) then the whole bundle has to be scalarized.
-  if (MustScalarizeFlag)
-    return getScalarizationCost(VecTy);
-
-  if (!VL0) return getScalarizationCost(VecTy);
-  assert(VL0->getParent() == BB && "Wrong BB");
-
-  unsigned Opcode = VL0->getOpcode();
-  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    // If not all of the instructions are identical then we have to scalarize.
-    if (!I || Opcode != I->getOpcode()) return getScalarizationCost(VecTy);
-  }
-
-  // Check if it is safe to sink the loads or the stores.
-  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
-    int MaxIdx = InstrIdx[VL0];
-    for (unsigned i = 1, e = VL.size(); i < e; ++i)
-      MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
-
-    Instruction *Last = InstrVec[MaxIdx];
-    for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-      if (VL[i] == Last) continue;
-      Value *Barrier = isUnsafeToSink(cast<Instruction>(VL[i]), Last);
-      if (Barrier) {
-        DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " <<
-              *Last << "\n because of " << *Barrier << "\n");
-        return max_cost;
-      }
-    }
-  }
-
-  switch (Opcode) {
-  case Instruction::ZExt:
-  case Instruction::SExt:
-  case Instruction::FPToUI:
-  case Instruction::FPToSI:
-  case Instruction::FPExt:
-  case Instruction::PtrToInt:
-  case Instruction::IntToPtr:
-  case Instruction::SIToFP:
-  case Instruction::UIToFP:
-  case Instruction::Trunc:
-  case Instruction::FPTrunc:
-  case Instruction::BitCast: {
-    int Cost = 0;
-    ValueList Operands;
-    Type *SrcTy = VL0->getOperand(0)->getType();
-    // Prepare the operand vector.
-    for (unsigned j = 0; j < VL.size(); ++j) {
-      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
-      // Check that the casted type is the same for all users.
-      if (cast<Instruction>(VL[j])->getOperand(0)->getType() != SrcTy)
-        return getScalarizationCost(VecTy);
-    }
-
-    Cost += getTreeCost_rec(Operands, Depth+1);
-    if (Cost >= max_cost) return max_cost;
-
-    // Calculate the cost of this instruction.
-    int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
-                                                       VL0->getType(), SrcTy);
-
-    VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
-    int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
-    Cost += (VecCost - ScalarCost);
-    return Cost;
-  }
-  case Instruction::Add:
-  case Instruction::FAdd:
-  case Instruction::Sub:
-  case Instruction::FSub:
-  case Instruction::Mul:
-  case Instruction::FMul:
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::FDiv:
-  case Instruction::URem:
-  case Instruction::SRem:
-  case Instruction::FRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    int Cost = 0;
-    // Calculate the cost of all of the operands.
-    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
-      ValueList Operands;
-      // Prepare the operand vector.
-      for (unsigned j = 0; j < VL.size(); ++j)
-        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
-
-      Cost += getTreeCost_rec(Operands, Depth+1);
-      if (Cost >= max_cost) return max_cost;
-    }
-
-    // Calculate the cost of this instruction.
-    int ScalarCost = VecTy->getNumElements() *
-                     TTI->getArithmeticInstrCost(Opcode, ScalarTy);
-
-    int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
-    Cost += (VecCost - ScalarCost);
-    return Cost;
-  }
-  case Instruction::Load: {
-    // If we have to scalarize the loads, add the cost of forming the vector.
-    for (unsigned i = 0, e = VL.size()-1; i < e; ++i)
-      if (!isConsecutiveAccess(VL[i], VL[i+1]))
-        return getScalarizationCost(VecTy);
-
-    // Cost of wide load - cost of scalar loads.
-    int ScalarLdCost = VecTy->getNumElements() *
-                       TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
-    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
-    return VecLdCost - ScalarLdCost;
-  }
-  case Instruction::Store: {
-    // We know that we can merge the stores. Calculate the cost.
-    int ScalarStCost = VecTy->getNumElements() *
-                       TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
-    int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
-    int StoreCost = VecStCost - ScalarStCost;
-
-    ValueList Operands;
-    for (unsigned j = 0; j < VL.size(); ++j) {
-      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
-      MemBarrierIgnoreList.insert(VL[j]);
-    }
-
-    int TotalCost = StoreCost + getTreeCost_rec(Operands, Depth + 1);
-    return TotalCost;
-  }
-  default:
-    // Unable to vectorize unknown instructions.
-    return getScalarizationCost(VecTy);
-  }
-}
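[Widening the loads or stores conceptually sinks each bundle member down to the member that appears last in the block, which is legal only if no may-aliasing memory operation stands in between; isUnsafeToSink, defined earlier in this file, reports the first such barrier. A simplified stand-alone model of that scan, with an invented alias-class relation in place of AliasAnalysis:]

    #include <cstdio>
    #include <string>
    #include <vector>

    // Toy instruction: a name, its memory behavior, and a made-up alias class.
    struct Op { std::string Name; bool Writes; bool Reads; int AliasClass; };

    // Returns the index of the first blocking instruction between Src and Dst,
    // or -1 if sinking Src down to Dst is safe.
    static int firstBarrier(const std::vector<Op> &BB, int Src, int Dst) {
      for (int i = Src + 1; i < Dst; ++i) {
        // Mirrors the Write/Read split in isUnsafeToSink: a writer conflicts
        // with any memory access in between, a reader only with writers.
        bool Conflict = BB[Src].Writes ? (BB[i].Reads || BB[i].Writes)
                                       : BB[i].Writes;
        if (Conflict && BB[i].AliasClass == BB[Src].AliasClass)
          return i;
      }
      return -1;
    }

    int main() {
      std::vector<Op> BB = {
          {"store A[0]", true, false, 0},
          {"load  B[7]", false, true, 1},  // different class: harmless
          {"store A[9]", true, false, 0},  // same class: a barrier
          {"store A[1]", true, false, 0}};
      int B = firstBarrier(BB, 0, 3);
      if (B >= 0)
        printf("cannot sink %s past %s\n", BB[0].Name.c_str(),
               BB[B].Name.c_str());
      return 0;
    }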
-
-Instruction *BoUpSLP::GetLastInstr(ArrayRef<Value *> VL, unsigned VF) {
-  int MaxIdx = InstrIdx[BB->getFirstNonPHI()];
-  for (unsigned i = 0; i < VF; ++i)
-    MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
-  return InstrVec[MaxIdx + 1];
-}
-
-Value *BoUpSLP::Scalarize(ArrayRef<Value *> VL, VectorType *Ty) {
-  IRBuilder<> Builder(GetLastInstr(VL, Ty->getNumElements()));
-  Value *Vec = UndefValue::get(Ty);
-  for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
-    // Generate the 'InsertElement' instruction.
-    Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
-    // Remember that this instruction is used as part of a 'gather' sequence.
-    // The caller of the bottom-up slp vectorizer can try to hoist the sequence
-    // if the users are outside of the basic block.
-    GatherInstructions.push_back(Vec);
-  }
-
-  return Vec;
-}
-
-Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int VF) {
-  Value *V = vectorizeTree_rec(VL, VF);
-  // We moved some instructions around. We have to number them again
-  // before we can do any analysis.
-  numberInstructions();
-  MustScalarize.clear();
-  return V;
-}
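[Scalarize is the gather fall-back: it builds the vector lane by lane with insertelement, starting from undef, and records each step in GatherInstructions so the caller can hoist the sequence later. The stand-alone snippet below emits the same pattern with IRBuilder into a hypothetical function f; the recording step and the scheduling via GetLastInstr are omitted:]

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    int main() {
      LLVMContext Ctx;
      Module M("gather-demo", Ctx);
      Type *FloatTy = Type::getFloatTy(Ctx);
      Type *Args[] = {FloatTy, FloatTy, FloatTy, FloatTy};
      FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), Args, false);
      Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", &M);
      BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
      IRBuilder<> Builder(BB);

      // Fill an undef <4 x float> lane by lane from the function arguments,
      // exactly the shape of the sequence that Scalarize() produces.
      Value *Vec = UndefValue::get(VectorType::get(FloatTy, 4));
      unsigned i = 0;
      for (Function::arg_iterator AI = F->arg_begin(), AE = F->arg_end();
           AI != AE; ++AI, ++i)
        Vec = Builder.CreateInsertElement(Vec, &*AI, Builder.getInt32(i));
      Builder.CreateRetVoid();

      M.dump(); // prints the insertelement chain
      return 0;
    }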
-
-Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) {
-  Type *ScalarTy = VL[0]->getType();
-  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
-    ScalarTy = SI->getValueOperand()->getType();
-  VectorType *VecTy = VectorType::get(ScalarTy, VF);
-
-  // Check if all of the operands are constants or identical.
-  bool AllConst = true;
-  bool AllSameScalar = true;
-  for (unsigned i = 0, e = VF; i < e; ++i) {
-    AllConst &= isa<Constant>(VL[i]);
-    AllSameScalar &= (VL[0] == VL[i]);
-    // The instruction must be in the same BB, and it must be vectorizable.
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    if (MustScalarize.count(VL[i]) || (I && I->getParent() != BB))
-      return Scalarize(VL, VecTy);
-  }
-
-  // Gather constant or splatted values with an insertelement sequence.
-  if (AllConst || AllSameScalar) return Scalarize(VL, VecTy);
-
-  // Scalarize unknown structures.
-  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
-  if (!VL0) return Scalarize(VL, VecTy);
-
-  if (VectorizedValues.count(VL0)) return VectorizedValues[VL0];
-
-  unsigned Opcode = VL0->getOpcode();
-  for (unsigned i = 0, e = VF; i < e; ++i) {
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    // If not all of the instructions are identical then we have to scalarize.
-    if (!I || Opcode != I->getOpcode()) return Scalarize(VL, VecTy);
-  }
-
-  switch (Opcode) {
-  case Instruction::ZExt:
-  case Instruction::SExt:
-  case Instruction::FPToUI:
-  case Instruction::FPToSI:
-  case Instruction::FPExt:
-  case Instruction::PtrToInt:
-  case Instruction::IntToPtr:
-  case Instruction::SIToFP:
-  case Instruction::UIToFP:
-  case Instruction::Trunc:
-  case Instruction::FPTrunc:
-  case Instruction::BitCast: {
-    ValueList INVL;
-    for (int i = 0; i < VF; ++i)
-      INVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
-    Value *InVec = vectorizeTree_rec(INVL, VF);
-    IRBuilder<> Builder(GetLastInstr(VL, VF));
-    CastInst *CI = cast<CastInst>(VL0);
-    Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
-    VectorizedValues[VL0] = V;
-    return V;
-  }
-  case Instruction::Add:
-  case Instruction::FAdd:
-  case Instruction::Sub:
-  case Instruction::FSub:
-  case Instruction::Mul:
-  case Instruction::FMul:
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::FDiv:
-  case Instruction::URem:
-  case Instruction::SRem:
-  case Instruction::FRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    ValueList LHSVL, RHSVL;
-    for (int i = 0; i < VF; ++i) {
-      LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
-      RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1));
-    }
-
-    Value *LHS = vectorizeTree_rec(LHSVL, VF);
-    Value *RHS = vectorizeTree_rec(RHSVL, VF);
-    IRBuilder<> Builder(GetLastInstr(VL, VF));
-    BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
-    Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
-    VectorizedValues[VL0] = V;
-    return V;
-  }
-  case Instruction::Load: {
-    LoadInst *LI = cast<LoadInst>(VL0);
-    unsigned Alignment = LI->getAlignment();
-
-    // Check if all of the loads are consecutive.
-    for (unsigned i = 1, e = VF; i < e; ++i)
-      if (!isConsecutiveAccess(VL[i-1], VL[i]))
-        return Scalarize(VL, VecTy);
-
-    IRBuilder<> Builder(GetLastInstr(VL, VF));
-    Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
-                                          VecTy->getPointerTo());
-    LI = Builder.CreateLoad(VecPtr);
-    LI->setAlignment(Alignment);
-    VectorizedValues[VL0] = LI;
-    return LI;
-  }
-  case Instruction::Store: {
-    StoreInst *SI = cast<StoreInst>(VL0);
-    unsigned Alignment = SI->getAlignment();
-
-    ValueList ValueOp;
-    for (int i = 0; i < VF; ++i)
-      ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand());
-
-    Value *VecValue = vectorizeTree_rec(ValueOp, VF);
-
-    IRBuilder<> Builder(GetLastInstr(VL, VF));
-    Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
-                                          VecTy->getPointerTo());
-    Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment);
-
-    // The original scalar stores are now dead.
-    for (int i = 0; i < VF; ++i)
-      cast<Instruction>(VL[i])->eraseFromParent();
-    return 0;
-  }
-  default:
-    Value *S = Scalarize(VL, VecTy);
-    VectorizedValues[VL0] = S;
-    return S;
-  }
-}
-
-} // end of namespace
diff --git a/contrib/llvm/lib/Transforms/Vectorize/VecUtils.h b/contrib/llvm/lib/Transforms/Vectorize/VecUtils.h
deleted file mode 100644
index 5456c6c..0000000
--- a/contrib/llvm/lib/Transforms/Vectorize/VecUtils.h
+++ /dev/null
@@ -1,164 +0,0 @@
-//===- VecUtils.h - Vectorization Utilities -------------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This family of classes and functions manipulate vectors and chains of
-// vectors.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
-#define LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include <vector>
-
-namespace llvm {
-
-class BasicBlock; class Instruction; class Type;
-class VectorType; class StoreInst; class Value;
-class ScalarEvolution; class DataLayout;
-class TargetTransformInfo; class AliasAnalysis;
-class Loop;
-
-/// Bottom Up SLP vectorization utility class.
-struct BoUpSLP {
-  typedef SmallVector<Value*, 8> ValueList;
-  typedef SmallPtrSet<Value*, 16> ValueSet;
-  typedef SmallVector<StoreInst*, 8> StoreList;
-  static const int max_cost = 1<<20;
-
-  /// \brief Constructor.
-  BoUpSLP(BasicBlock *Bb, ScalarEvolution *Se, DataLayout *Dl,
-          TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp);
-
-  /// \brief Take the pointer operand from the Load/Store instruction.
-  /// \returns NULL if this is not a valid Load/Store instruction.
-  static Value *getPointerOperand(Value *I);
-
-  /// \brief Take the address space operand from the Load/Store instruction.
-  /// \returns -1 if this is not a valid Load/Store instruction.
-  static unsigned getAddressSpaceOperand(Value *I);
-
-  /// \returns true if the memory operations A and B are consecutive.
-  bool isConsecutiveAccess(Value *A, Value *B);
-
-  /// \brief Vectorize the tree that starts with the elements in \p VL.
-  /// \returns the vectorized value.
-  Value *vectorizeTree(ArrayRef<Value *> VL, int VF);
-
-  /// \returns the vectorization cost of the subtree that starts at \p VL.
-  /// A negative number means that this is profitable.
-  int getTreeCost(ArrayRef<Value *> VL);
-
-  /// \returns the scalarization cost for this list of values. Assuming that
-  /// this subtree gets vectorized, we may need to extract the values from the
-  /// roots. This method calculates the cost of extracting the values.
-  int getScalarizationCost(ArrayRef<Value *> VL);
-
-  /// \brief Attempts to order and vectorize a sequence of stores. This
-  /// function does a quadratic scan of the given stores.
-  /// \returns true if the basic block was modified.
-  bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold);
-
-  /// \brief Vectorize a group of scalars into a vector tree.
-  void vectorizeArith(ArrayRef<Value *> Operands);
-
-  /// \returns the list of new instructions that were added in order to collect
-  /// scalars into vectors. This list can be used to further optimize the gather
-  /// sequences.
-  ValueList &getGatherSeqInstructions() { return GatherInstructions; }
-
-private:
-  /// \brief This method contains the recursive part of getTreeCost.
-  int getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth);
-
-  /// \brief This recursive method looks for vectorization hazards such as
-  /// values that are used by multiple users and checks that values are used
-  /// by only one vector lane. It updates the variables LaneMap, MultiUserVals.
-  void getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth);
-
-  /// \brief This method contains the recursive part of vectorizeTree.
-  Value *vectorizeTree_rec(ArrayRef<Value *> VL, int VF);
-
-  /// \brief Number all of the instructions in the block.
-  void numberInstructions();
-
-  /// \brief Vectorize a sorted sequence of stores.
-  bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold);
-
-  /// \returns the scalarization cost for this type. Scalarization in this
-  /// context means the creation of vectors from a group of scalars.
-  int getScalarizationCost(Type *Ty);
-
-  /// \returns the AA location that is being accessed by the instruction.
-  AliasAnalysis::Location getLocation(Instruction *I);
-
-  /// \brief Checks if it is possible to sink an instruction from
-  /// \p Src to \p Dst.
-  /// \returns the pointer to the barrier instruction if we can't sink.
-  Value *isUnsafeToSink(Instruction *Src, Instruction *Dst);
-
-  /// \returns the instruction that appears last in the BB from \p VL.
-  /// Only consider the first \p VF elements.
-  Instruction *GetLastInstr(ArrayRef<Value *> VL, unsigned VF);
-
-  /// \returns a vector from a collection of scalars in \p VL.
-  Value *Scalarize(ArrayRef<Value *> VL, VectorType *Ty);
-
-private:
-  /// Maps instructions to numbers.
-  SmallDenseMap<Value*, int> InstrIdx;
-  /// Maps numbers back to Instructions.
-  std::vector<Instruction*> InstrVec;
-
-  // -- Containers that are used during getTreeCost -- //
-
-  /// Contains values that must be scalarized because they are used
-  /// by multiple lanes, or by users outside the tree.
-  /// NOTICE: The vectorization methods also use this set.
-  ValueSet MustScalarize;
-
-  /// Contains values that have multiple users; getTreeCost checks that all of
-  /// their users are inside the tree and in a single lane. This set must be
-  /// reset between runs.
-  ValueSet MultiUserVals;
-  /// Maps values in the tree to the vector lanes that use them. This map must
-  /// be reset between runs of getCost.
-  std::map<Value*, int> LaneMap;
-  /// A list of instructions to ignore while sinking
-  /// memory instructions. This set must be reset between runs of getCost.
-  SmallPtrSet<Value *, 8> MemBarrierIgnoreList;
-
-  // -- Containers that are used during vectorizeTree -- //
-
-  /// Maps the first scalar of each vectorized bundle to the resulting vector
-  /// value. This map must be reset between runs.
-  DenseMap<Value*, Value*> VectorizedValues;
-
-  // -- Containers that are used after vectorization by the caller -- //
-
-  /// A list of instructions that are used when gathering scalars into vectors.
-  /// In many cases these instructions can be hoisted outside of the BB.
-  /// Iterating over this list is faster than calling LICM.
-  ValueList GatherInstructions;
-
-  // Analysis and block reference.
-  BasicBlock *BB;
-  ScalarEvolution *SE;
-  DataLayout *DL;
-  TargetTransformInfo *TTI;
-  AliasAnalysis *AA;
-  Loop *L;
-};
-
-} // end of namespace
-
-#endif // LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
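[For reference, a sketch of how a caller such as the SLPVectorizer pass might drive the interface above; the analysis pointers are assumed to be obtained by the calling pass, and Threshold is a hypothetical profitability cut-off, not a value taken from this diff:]

    #include "VecUtils.h"

    using namespace llvm;

    bool tryToVectorizeBundle(BasicBlock *BB, ArrayRef<Value *> Bundle,
                              ScalarEvolution *SE, DataLayout *DL,
                              TargetTransformInfo *TTI, AliasAnalysis *AA,
                              Loop *L, int Threshold) {
      BoUpSLP R(BB, SE, DL, TTI, AA, L);

      // getTreeCost returns (vector cost - scalar cost); negative is profitable.
      if (R.getTreeCost(Bundle) >= Threshold)
        return false;

      // Emit the vector tree and replace the scalar uses with extractelements.
      R.vectorizeArith(Bundle);

      // The insertelement sequences emitted for gathered scalars are recorded
      // so the caller can try to hoist them out of the block.
      BoUpSLP::ValueList &Gathers = R.getGatherSeqInstructions();
      (void)Gathers;
      return true;
    }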